Mercurial repository devteam / annotation_profiler: annotation_profiler_for_interval.py @ 0:3b33da018e74 (draft, default, tip)
Imported from capsule None
| author | devteam |
|---|---|
| date | Mon, 19 May 2014 12:33:42 -0400 |
| parents | |
| children | |
File contents at revision 0:3b33da018e74 (file added; previous revision -1:000000000000):
```python
#!/usr/bin/env python
#Dan Blankenberg
#For a set of intervals, this tool returns the same set of intervals
#with 3 additional fields: the name of a Table/Feature, the number of
#bases covered and the number of overlapping regions. The original intervals are repeated for each Table/Feature.

import sys, struct, optparse, os, random
import bx.intervals.io
import bx.bitset
try:
    import psyco
    psyco.full()
except:
    pass

assert sys.version_info[:2] >= ( 2, 4 )

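# Lazy, index-based access to a binary ".covered" file of packed (start, end)
# pairs; values are unpacked with the struct format given in profiler_info
# (default '<I') and cached in memory the first time an index is requested.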
class CachedRangesInFile:
    DEFAULT_STRUCT_FORMAT = '<I'
    def __init__( self, filename, profiler_info ):
        self.file_size = os.stat( filename ).st_size
        self.file = open( filename, 'rb' )
        self.filename = filename
        self.fmt = profiler_info.get( 'profiler_struct_format', self.DEFAULT_STRUCT_FORMAT )
        self.fmt_size = int( profiler_info.get( 'profiler_struct_size', struct.calcsize( self.fmt ) ) )
        self.length = int( self.file_size / self.fmt_size / 2 )
        self._cached_ranges = [ None for i in xrange( self.length ) ]
    def __getitem__( self, i ):
        if self._cached_ranges[i] is not None:
            return self._cached_ranges[i]
        if i < 0: i = self.length + i
        offset = i * self.fmt_size * 2
        self.file.seek( offset )
        try:
            start = struct.unpack( self.fmt, self.file.read( self.fmt_size ) )[0]
            end = struct.unpack( self.fmt, self.file.read( self.fmt_size ) )[0]
        except Exception, e:
            raise IndexError, e
        self._cached_ranges[i] = ( start, end )
        return start, end
    def __len__( self ):
        return self.length

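# Coverage data for one table/chromosome: the ranges from "<base>.covered" plus
# the precomputed "<base>.total_coverage" value. Interval queries return the
# number of bases covered, the number of overlapping ranges and the index of
# the first candidate range (found by binary search).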
class RegionCoverage:
    def __init__( self, filename_base, profiler_info ):
        try:
            self._coverage = CachedRangesInFile( "%s.covered" % filename_base, profiler_info )
        except Exception, e:
            #print "Error loading coverage file %s: %s" % ( "%s.covered" % filename_base, e )
            self._coverage = []
        try:
            self._total_coverage = int( open( "%s.total_coverage" % filename_base ).read() )
        except Exception, e:
            #print "Error loading total coverage file %s: %s" % ( "%s.total_coverage" % filename_base, e )
            self._total_coverage = 0
    def get_start_index( self, start ):
        #binary search: returns index of range closest to start
        if start > self._coverage[-1][1]:
            return len( self._coverage ) - 1
        i = 0
        j = len( self._coverage ) - 1
        while i < j:
            k = ( i + j ) / 2
            if start <= self._coverage[k][1]:
                j = k
            else:
                i = k + 1
        return i
    def get_coverage( self, start, end ):
        return self.get_coverage_regions_overlap( start, end )[0]
    def get_coverage_regions_overlap( self, start, end ):
        return self.get_coverage_regions_index_overlap( start, end )[0:2]
    def get_coverage_regions_index_overlap( self, start, end ):
        if len( self._coverage ) < 1 or start > self._coverage[-1][1] or end < self._coverage[0][0]:
            return 0, 0, 0
        if self._total_coverage and start <= self._coverage[0][0] and end >= self._coverage[-1][1]:
            return self._total_coverage, len( self._coverage ), 0
        coverage = 0
        region_count = 0
        start_index = self.get_start_index( start )
        for i in xrange( start_index, len( self._coverage ) ):
            c_start, c_end = self._coverage[i]
            if c_start > end:
                break
            if c_start <= end and c_end >= start:
                coverage += min( end, c_end ) - max( start, c_start )
                region_count += 1
        return coverage, region_count, start_index

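# Loads RegionCoverage objects on demand, one per (table, chromosome), keeping
# at most `buffer` chromosomes per table in memory; a random chromosome is
# evicted when the buffer is full.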
class CachedCoverageReader:
    def __init__( self, base_file_path, buffer = 10, table_names = None, profiler_info = None ):
        self._base_file_path = base_file_path
        self._buffer = buffer #number of chromosomes to keep in memory at a time
        self._coverage = {}
        if table_names is None: table_names = [ table_dir for table_dir in os.listdir( self._base_file_path ) if os.path.isdir( os.path.join( self._base_file_path, table_dir ) ) ]
        for tablename in table_names: self._coverage[tablename] = {}
        if profiler_info is None: profiler_info = {}
        self._profiler_info = profiler_info
    def iter_table_coverage_by_region( self, chrom, start, end ):
        for tablename, coverage, regions in self.iter_table_coverage_regions_by_region( chrom, start, end ):
            yield tablename, coverage
    def iter_table_coverage_regions_by_region( self, chrom, start, end ):
        for tablename, coverage, regions, index in self.iter_table_coverage_regions_index_by_region( chrom, start, end ):
            yield tablename, coverage, regions
    def iter_table_coverage_regions_index_by_region( self, chrom, start, end ):
        for tablename, chromosomes in self._coverage.iteritems():
            if chrom not in chromosomes:
                if len( chromosomes ) >= self._buffer:
                    #randomly remove one chromosome from this table
                    del chromosomes[ chromosomes.keys().pop( random.randint( 0, self._buffer - 1 ) ) ]
                chromosomes[chrom] = RegionCoverage( os.path.join( self._base_file_path, tablename, chrom ), self._profiler_info )
            coverage, regions, index = chromosomes[chrom].get_coverage_regions_index_overlap( start, end )
            yield tablename, coverage, regions, index

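# Accumulates per-table statistics over all of the user's input intervals,
# truncating intervals that extend past the known chromosome length and
# recording those errors. iter_table_coverage() yields one summary row per
# table, including non-redundant values computed from the collapsed (bitset)
# intervals.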
class TableCoverageSummary:
    def __init__( self, coverage_reader, chrom_lengths ):
        self.coverage_reader = coverage_reader
        self.chrom_lengths = chrom_lengths
        self.chromosome_coverage = {} #dict of bitset by chromosome holding user's collapsed input intervals
        self.total_interval_size = 0 #total size of user's input intervals
        self.total_interval_count = 0 #total number of user's input intervals
        self.table_coverage = {} #dict of total coverage by user's input intervals by table
        self.table_chromosome_size = {} #dict of dict of table:chrom containing total coverage of table for a chrom
        self.table_chromosome_count = {} #dict of dict of table:chrom containing total number of coverage ranges of table for a chrom
        self.table_regions_overlaped_count = {} #total number of table regions overlapping user's input intervals (non-unique)
        self.interval_table_overlap_count = {} #total number of user input intervals which overlap table
        self.region_size_errors = {} #dictionary of lists of invalid ranges by chromosome
    def add_region( self, chrom, start, end ):
        chrom_length = self.chrom_lengths.get( chrom )
        region_start = min( start, chrom_length )
        region_end = min( end, chrom_length )
        region_length = region_end - region_start

        if region_length < 1 or region_start != start or region_end != end:
            if chrom not in self.region_size_errors:
                self.region_size_errors[chrom] = []
            self.region_size_errors[chrom].append( ( start, end ) )
            if region_length < 1: return

        self.total_interval_size += region_length
        self.total_interval_count += 1
        if chrom not in self.chromosome_coverage:
            self.chromosome_coverage[chrom] = bx.bitset.BitSet( chrom_length )

        self.chromosome_coverage[chrom].set_range( region_start, region_length )
        for table_name, coverage, regions in self.coverage_reader.iter_table_coverage_regions_by_region( chrom, region_start, region_end ):
            if table_name not in self.table_coverage:
                self.table_coverage[table_name] = 0
                self.table_chromosome_size[table_name] = {}
                self.table_regions_overlaped_count[table_name] = 0
                self.interval_table_overlap_count[table_name] = 0
                self.table_chromosome_count[table_name] = {}
            if chrom not in self.table_chromosome_size[table_name]:
                self.table_chromosome_size[table_name][chrom] = self.coverage_reader._coverage[table_name][chrom]._total_coverage
                self.table_chromosome_count[table_name][chrom] = len( self.coverage_reader._coverage[table_name][chrom]._coverage )
            self.table_coverage[table_name] += coverage
            if coverage:
                self.interval_table_overlap_count[table_name] += 1
                self.table_regions_overlaped_count[table_name] += regions
    def iter_table_coverage( self ):
        def get_nr_coverage():
            #returns non-redundant coverage, where user's input intervals have been collapsed to resolve overlaps
            table_coverage = {} #dictionary of tables containing number of table bases overlapped by nr intervals
            interval_table_overlap_count = {} #dictionary of tables containing number of nr intervals overlapping table
            table_regions_overlap_count = {} #dictionary of tables containing number of regions overlapped (unique)
            interval_count = 0 #total number of nr intervals
            interval_size = 0 #holds total size of nr intervals
            region_start_end = {} #holds absolute start,end for each user input chromosome
            for chrom, chromosome_bitset in self.chromosome_coverage.iteritems():
                #loop through user's collapsed input intervals
                end = 0
                last_end_index = {}
                interval_size += chromosome_bitset.count_range()
                while True:
                    if end >= chromosome_bitset.size: break
                    start = chromosome_bitset.next_set( end )
                    if start >= chromosome_bitset.size: break
                    end = chromosome_bitset.next_clear( start )
                    interval_count += 1
                    if chrom not in region_start_end:
                        region_start_end[chrom] = [ start, end ]
                    else:
                        region_start_end[chrom][1] = end
                    for table_name, coverage, region_count, start_index in self.coverage_reader.iter_table_coverage_regions_index_by_region( chrom, start, end ):
                        if table_name not in table_coverage:
                            table_coverage[table_name] = 0
                            interval_table_overlap_count[table_name] = 0
                            table_regions_overlap_count[table_name] = 0
                        table_coverage[table_name] += coverage
                        if coverage:
                            interval_table_overlap_count[table_name] += 1
                            table_regions_overlap_count[table_name] += region_count
                            if table_name in last_end_index and last_end_index[table_name] == start_index:
                                table_regions_overlap_count[table_name] -= 1
                            last_end_index[table_name] = start_index + region_count - 1
            table_region_coverage = {} #total coverage for tables by bounding nr interval region
            table_region_count = {} #total number for tables by bounding nr interval region
            for chrom, start_end in region_start_end.items():
                for table_name, coverage, region_count in self.coverage_reader.iter_table_coverage_regions_by_region( chrom, start_end[0], start_end[1] ):
                    if table_name not in table_region_coverage:
                        table_region_coverage[table_name] = 0
                        table_region_count[table_name] = 0
                    table_region_coverage[table_name] += coverage
                    table_region_count[table_name] += region_count
            return table_region_coverage, table_region_count, interval_count, interval_size, table_coverage, table_regions_overlap_count, interval_table_overlap_count
        table_region_coverage, table_region_count, nr_interval_count, nr_interval_size, nr_table_coverage, nr_table_regions_overlap_count, nr_interval_table_overlap_count = get_nr_coverage()
        for table_name in self.table_coverage:
            #TODO: determine a type of statistic, then calculate and report here
            yield table_name, sum( self.table_chromosome_size.get( table_name, {} ).values() ), sum( self.table_chromosome_count.get( table_name, {} ).values() ), table_region_coverage.get( table_name, 0 ), table_region_count.get( table_name, 0 ), self.total_interval_count, self.total_interval_size, self.table_coverage[table_name], self.table_regions_overlaped_count.get( table_name, 0 ), self.interval_table_overlap_count.get( table_name, 0 ), nr_interval_count, nr_interval_size, nr_table_coverage[table_name], nr_table_regions_overlap_count.get( table_name, 0 ), nr_interval_table_overlap_count.get( table_name, 0 )

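# Per-interval mode: write one output line per (input interval, table) pair,
# appending the table name, bases covered and overlapping region count to the
# original interval fields.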
def profile_per_interval( interval_filename, chrom_col, start_col, end_col, out_filename, keep_empty, coverage_reader ):
    out = open( out_filename, 'wb' )
    for region in bx.intervals.io.NiceReaderWrapper( open( interval_filename, 'rb' ), chrom_col = chrom_col, start_col = start_col, end_col = end_col, fix_strand = True, return_header = False, return_comments = False ):
        for table_name, coverage, region_count in coverage_reader.iter_table_coverage_regions_by_region( region.chrom, region.start, region.end ):
            if keep_empty or coverage:
                #only output regions that have at least 1 base covered, unless empty results are requested
                out.write( "%s\t%s\t%s\t%s\n" % ( "\t".join( region.fields ), table_name, coverage, region_count ) )
    out.close()

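# Summary mode: write one row of aggregate statistics per table, then report
# any intervals that had to be truncated to the known chromosome lengths.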
def profile_summary( interval_filename, chrom_col, start_col, end_col, out_filename, keep_empty, coverage_reader, chrom_lengths ):
    out = open( out_filename, 'wb' )
    table_coverage_summary = TableCoverageSummary( coverage_reader, chrom_lengths )
    for region in bx.intervals.io.NiceReaderWrapper( open( interval_filename, 'rb' ), chrom_col = chrom_col, start_col = start_col, end_col = end_col, fix_strand = True, return_header = False, return_comments = False ):
        table_coverage_summary.add_region( region.chrom, region.start, region.end )

    out.write( "#tableName\ttableChromosomeCoverage\ttableChromosomeCount\ttableRegionCoverage\ttableRegionCount\tallIntervalCount\tallIntervalSize\tallCoverage\tallTableRegionsOverlaped\tallIntervalsOverlapingTable\tnrIntervalCount\tnrIntervalSize\tnrCoverage\tnrTableRegionsOverlaped\tnrIntervalsOverlapingTable\n" )
    for table_name, table_chromosome_size, table_chromosome_count, table_region_coverage, table_region_count, total_interval_count, total_interval_size, total_coverage, table_regions_overlaped_count, interval_region_overlap_count, nr_interval_count, nr_interval_size, nr_coverage, nr_table_regions_overlaped_count, nr_interval_table_overlap_count in table_coverage_summary.iter_table_coverage():
        if keep_empty or total_coverage:
            #only output tables that have at least 1 base covered, unless empty results are requested
            out.write( "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( table_name, table_chromosome_size, table_chromosome_count, table_region_coverage, table_region_count, total_interval_count, total_interval_size, total_coverage, table_regions_overlaped_count, interval_region_overlap_count, nr_interval_count, nr_interval_size, nr_coverage, nr_table_regions_overlaped_count, nr_interval_table_overlap_count ) )
    out.close()

    #report chrom size errors as needed:
    if table_coverage_summary.region_size_errors:
        print "Regions provided extended beyond known chromosome lengths, and have been truncated as necessary, for the following intervals:"
        for chrom, regions in table_coverage_summary.region_size_errors.items():
            if len( regions ) > 3:
                extra_region_info = ", ... "
            else:
                extra_region_info = ""
            print "%s has max length of %s, exceeded by %s%s." % ( chrom, chrom_lengths.get( chrom ), ", ".join( map( str, regions[:3] ) ), extra_region_info )

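# Chromosome name -> length lookup built from the 'chromosomes' entry of
# profiler_info ('name=length' pairs separated by commas); unknown chromosomes
# fall back to the default bitset size.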
class ChromosomeLengths:
    def __init__( self, profiler_info ):
        self.chroms = {}
        self.default_bitset_size = int( profiler_info.get( 'bitset_size', bx.bitset.MAX ) )
        chroms = profiler_info.get( 'chromosomes', None )
        if chroms:
            for chrom in chroms.split( ',' ):
                fields = chrom.rsplit( '=', 1 )
                if len( fields ) == 2:
                    self.chroms[ fields[0] ] = int( fields[1] )
                else:
                    self.chroms[ fields[0] ] = self.default_bitset_size
    def get( self, name ):
        return self.chroms.get( name, self.default_bitset_size )

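# Parse tab-separated key/value pairs from profiler_info.txt; repeated keys are
# collected into lists, and a missing file yields an empty dict.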
def parse_profiler_info( filename ):
    profiler_info = {}
    try:
        for line in open( filename ):
            fields = line.rstrip( '\n\r' ).split( '\t', 1 )
            if len( fields ) == 2:
                if fields[0] in profiler_info:
                    if not isinstance( profiler_info[ fields[0] ], list ):
                        profiler_info[ fields[0] ] = [ profiler_info[ fields[0] ] ]
                    profiler_info[ fields[0] ].append( fields[1] )
                else:
                    profiler_info[ fields[0] ] = fields[1]
    except:
        pass #likely missing file
    return profiler_info

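# Command-line entry point: parse options, build the coverage reader and run
# either summary (-S) or per-interval profiling.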
def __main__():
    parser = optparse.OptionParser()
    parser.add_option(
        '-k','--keep_empty',
        action="store_true",
        dest='keep_empty',
        default=False,
        help='Keep tables with 0 coverage'
    )
    parser.add_option(
        '-b','--buffer',
        dest='buffer',
        type='int',default=10,
        help='Number of Chromosomes to keep buffered'
    )
    parser.add_option(
        '-c','--chrom_col',
        dest='chrom_col',
        type='int',default=1,
        help='Chromosome column'
    )
    parser.add_option(
        '-s','--start_col',
        dest='start_col',
        type='int',default=2,
        help='Start Column'
    )
    parser.add_option(
        '-e','--end_col',
        dest='end_col',
        type='int',default=3,
        help='End Column'
    )
    parser.add_option(
        '-p','--path',
        dest='path',
        type='str',default='/galaxy/data/annotation_profiler/hg18',
        help='Path to profiled data for this organism'
    )
    parser.add_option(
        '-t','--table_names',
        dest='table_names',
        type='str',default='None',
        help='Table names requested'
    )
    parser.add_option(
        '-i','--input',
        dest='interval_filename',
        type='str',
        help='Input Interval File'
    )
    parser.add_option(
        '-o','--output',
        dest='out_filename',
        type='str',
        help='Output Filename'
    )
    parser.add_option(
        '-S','--summary',
        action="store_true",
        dest='summary',
        default=False,
        help='Display Summary Results'
    )

    options, args = parser.parse_args()

    assert os.path.isdir( options.path ), IOError( "Configuration error: Table directory is missing (%s)" % options.path )

    #get profiler_info
    profiler_info = parse_profiler_info( os.path.join( options.path, 'profiler_info.txt' ) )

    table_names = options.table_names.split( "," )
    if table_names == ['None']: table_names = None
    coverage_reader = CachedCoverageReader( options.path, buffer = options.buffer, table_names = table_names, profiler_info = profiler_info )

    if options.summary:
        profile_summary( options.interval_filename, options.chrom_col - 1, options.start_col - 1, options.end_col - 1, options.out_filename, options.keep_empty, coverage_reader, ChromosomeLengths( profiler_info ) )
    else:
        profile_per_interval( options.interval_filename, options.chrom_col - 1, options.start_col - 1, options.end_col - 1, options.out_filename, options.keep_empty, coverage_reader )

    #print out data version info
    print 'Data version (%s:%s:%s)' % ( profiler_info.get( 'dbkey', 'unknown' ), profiler_info.get( 'profiler_hash', 'unknown' ), profiler_info.get( 'dump_time', 'unknown' ) )

if __name__ == "__main__": __main__()
```
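For orientation, here is a minimal sketch of how the classes above can be used programmatically, assuming the module is importable; the data path and table name are hypothetical examples, and in normal use the script is driven through the command-line options handled in `__main__`.

```python
# A minimal usage sketch, not part of the repository file; the data path and
# table name are hypothetical examples (data layout: <path>/<table>/<chrom>.covered).
import os
from annotation_profiler_for_interval import parse_profiler_info, CachedCoverageReader

data_path = '/galaxy/data/annotation_profiler/hg18'
profiler_info = parse_profiler_info( os.path.join( data_path, 'profiler_info.txt' ) )
reader = CachedCoverageReader( data_path, buffer = 10, table_names = [ 'knownGene' ], profiler_info = profiler_info )

# Report per-table coverage of a single query interval (Python 2, like the script above).
for table_name, coverage, region_count in reader.iter_table_coverage_regions_by_region( 'chr1', 10000, 20000 ):
    print table_name, coverage, region_count
```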
