Mercurial > repos > devteam > fasta_concatenate_by_species
comparison utils/maf_utilities.py @ 0:2126e1b833a2
Imported from capsule None
| author | devteam |
|---|---|
| date | Mon, 19 May 2014 12:33:30 -0400 |
| parents | |
| children | 717aee069681 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:2126e1b833a2 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 """ | |
| 3 Provides wrappers and utilities for working with MAF files and alignments. | |
| 4 """ | |
| 5 #Dan Blankenberg | |
| 6 import pkg_resources; pkg_resources.require( "bx-python" ) | |
| 7 import bx.align.maf | |
| 8 import bx.intervals | |
| 9 import bx.interval_index_file | |
| 10 import sys, os, string, tempfile | |
| 11 import logging | |
| 12 from copy import deepcopy | |
| 13 | |
#this module targets Python 2 (2.4+): print-statement, iteritems, maketrans, etc.
assert sys.version_info[:2] >= ( 2, 4 )

log = logging.getLogger(__name__)


#characters treated as alignment gaps when trimming/overwriting sequence text
GAP_CHARS = [ '-' ]
#separator between species and chromosome in a MAF src field, e.g. "hg18.chr1"
SRC_SPLIT_CHAR = '.'
| 21 | |
def src_split( src ):
    #split a MAF src field into ( species, chromosome ); when no separator
    #is present the single value serves as both
    parts = src.split( SRC_SPLIT_CHAR, 1 )
    species = parts[0]
    if len( parts ) > 1:
        chromosome = parts[1]
    else:
        chromosome = species
    return species, chromosome
| 30 | |
def src_merge( spec, chrom, contig = None ):
    #when either part is missing, reuse the other for both before delegating
    #to bx's merge
    if None in ( spec, chrom ):
        spec = chrom = spec or chrom
    return bx.align.maf.src_merge( spec, chrom, contig )
| 35 | |
def get_species_in_block( block ):
    #unique species names in a block, in order of first appearance
    seen = []
    for component in block.components:
        spec = src_split( component.src )[0]
        if spec not in seen:
            seen.append( spec )
    return seen
| 43 | |
def tool_fail( msg = "Unknown Error" ):
    #report a fatal error on stderr and terminate the tool
    sys.stderr.write( "Fatal Error: %s\n" % msg )
    sys.exit()
| 47 | |
#an object corresponding to a reference layered alignment
class RegionAlignment( object ):
    """Layered alignment over a fixed-size reference region.

    Each species' text lives in its own temporary file, pre-filled with gap
    characters and overwritten in place as positions are assigned.
    """

    DNA_COMPLEMENT = string.maketrans( "ACGTacgt", "TGCAtgca" )
    MAX_SEQUENCE_SIZE = sys.maxint #Maximum length of sequence allowed

    def __init__( self, size, species = None ):
        #size: number of alignment columns
        #species: a species name or list of names to register up front;
        #None (the default) registers species lazily via set_range
        assert size <= self.MAX_SEQUENCE_SIZE, "Maximum length allowed for an individual sequence has been exceeded (%i > %i)." % ( size, self.MAX_SEQUENCE_SIZE )
        self.size = size
        self.sequences = {}
        #None default instead of a mutable [] default, which would be shared
        #across calls
        if species is None:
            species = []
        elif not isinstance( species, list ):
            species = [species]
        for spec in species:
            self.add_species( spec )

    #add a species to the alignment, seeding its sequence with all gaps
    def add_species( self, species ):
        #make temporary sequence files
        self.sequences[species] = tempfile.TemporaryFile()
        self.sequences[species].write( "-" * self.size )

    #returns the names for species found in alignment, skipping names as requested
    def get_species_names( self, skip = None ):
        if skip is None:
            skip = []
        elif not isinstance( skip, list ):
            skip = [skip]
        names = self.sequences.keys()
        for name in skip:
            try:
                names.remove( name )
            except ValueError: #requested skip not present; nothing to remove
                pass
        return names

    #returns the sequence for a species
    def get_sequence( self, species ):
        self.sequences[species].seek( 0 )
        return self.sequences[species].read()

    #returns the reverse complement of the sequence for a species
    def get_sequence_reverse_complement( self, species ):
        complement = [base for base in self.get_sequence( species ).translate( self.DNA_COMPLEMENT )]
        complement.reverse()
        return "".join( complement )

    #sets a single position for a species (delegates to set_range)
    def set_position( self, index, species, base ):
        if len( base ) != 1:
            raise Exception( "A genomic position can only have a length of 1." )
        return self.set_range( index, species, base )

    #sets a range for a species, registering the species on first use
    def set_range( self, index, species, bases ):
        if index >= self.size or index < 0:
            raise Exception( "Your index (%i) is out of range (0 - %i)." % ( index, self.size - 1 ) )
        if len( bases ) == 0:
            raise Exception( "A set of genomic positions can only have a positive length." )
        if species not in self.sequences:
            self.add_species( species )
        self.sequences[species].seek( index )
        self.sequences[species].write( bases )

    #Flush temp file of specified species, or all species
    def flush( self, species = None ):
        if species is None:
            species = self.sequences.keys()
        elif not isinstance( species, list ):
            species = [species]
        for spec in species:
            self.sequences[spec].flush()
| 109 | |
class GenomicRegionAlignment( RegionAlignment ):
    """RegionAlignment anchored at absolute genomic coordinates [start, end)."""

    def __init__( self, start, end, species = None ):
        #None default instead of a shared mutable [] default; normalize
        #before handing the list to the base class
        if species is None:
            species = []
        RegionAlignment.__init__( self, end - start, species )
        self.start = start
        self.end = end
| 116 | |
class SplicedAlignment( object ):
    """Alignment over a set of exons, each stored as a GenomicRegionAlignment."""

    DNA_COMPLEMENT = string.maketrans( "ACGTacgt", "TGCAtgca" )

    def __init__( self, exon_starts, exon_ends, species = None ):
        #exon_starts/exon_ends: single coordinates or parallel lists
        #species: None default instead of a shared mutable [] default
        if species is None:
            species = []
        if not isinstance( exon_starts, list ):
            exon_starts = [exon_starts]
        if not isinstance( exon_ends, list ):
            exon_ends = [exon_ends]
        #message fixed: the parallel list is the ends, not "sizes"
        assert len( exon_starts ) == len( exon_ends ), "The number of starts does not match the number of ends."
        self.exons = []
        for i in range( len( exon_starts ) ):
            self.exons.append( GenomicRegionAlignment( exon_starts[i], exon_ends[i], species ) )

    #returns the names for species found in alignment, skipping names as requested
    def get_species_names( self, skip = None ):
        if skip is None:
            skip = []
        elif not isinstance( skip, list ):
            skip = [skip]
        names = []
        for exon in self.exons:
            for name in exon.get_species_names( skip = skip ):
                if name not in names:
                    names.append( name )
        return names

    #returns the sequence for a species; exons lacking the species are
    #padded out with gap characters
    def get_sequence( self, species ):
        sequence = tempfile.TemporaryFile()
        for exon in self.exons:
            if species in exon.get_species_names():
                sequence.write( exon.get_sequence( species ) )
            else:
                sequence.write( "-" * exon.size )
        sequence.seek( 0 )
        return sequence.read()

    #returns the reverse complement of the sequence for a species
    def get_sequence_reverse_complement( self, species ):
        complement = [base for base in self.get_sequence( species ).translate( self.DNA_COMPLEMENT )]
        complement.reverse()
        return "".join( complement )

    #Start and end of coding region
    @property
    def start( self ):
        return self.exons[0].start

    @property
    def end( self ):
        return self.exons[-1].end
| 165 | |
| 166 #Open a MAF index using a UID | |
| 167 def maf_index_by_uid( maf_uid, index_location_file ): | |
| 168 for line in open( index_location_file ): | |
| 169 try: | |
| 170 #read each line, if not enough fields, go to next line | |
| 171 if line[0:1] == "#" : continue | |
| 172 fields = line.split('\t') | |
| 173 if maf_uid == fields[1]: | |
| 174 try: | |
| 175 maf_files = fields[4].replace( "\n", "" ).replace( "\r", "" ).split( "," ) | |
| 176 return bx.align.maf.MultiIndexed( maf_files, keep_open = True, parse_e_rows = False ) | |
| 177 except Exception, e: | |
| 178 raise Exception( 'MAF UID (%s) found, but configuration appears to be malformed: %s' % ( maf_uid, e ) ) | |
| 179 except: | |
| 180 pass | |
| 181 return None | |
| 182 | |
#return ( index, temp_index_filename ) for user maf, if available, or build one and return it, return None when no tempfile is created
def open_or_build_maf_index( maf_file, index_filename, species = None ):
    #prefer the pre-built index; fall back to building one on the fly
    try:
        index = bx.align.maf.Indexed( maf_file, index_filename = index_filename, keep_open = True, parse_e_rows = False )
        return ( index, None )
    except:
        return build_maf_index( maf_file, species = species )
| 189 | |
def build_maf_index_species_chromosomes( filename, index_species = None ):
    """Scan a MAF file, building an interval index of component coordinates.

    Returns ( indexes, species, species_chromosomes, blocks ):
      indexes - bx.interval_index_file.Indexes keyed by component src, or
                None when the file could not be parsed
      species - species names seen, in order of first appearance
      species_chromosomes - dict of species name -> list of chromosomes seen
      blocks - number of alignment blocks read

    When index_species is given, only components of those species are added
    to the index (all species still appear in the other return values).
    """
    species = []
    species_chromosomes = {}
    indexes = bx.interval_index_file.Indexes()
    blocks = 0
    try:
        maf_reader = bx.align.maf.Reader( open( filename ) )
        while True:
            #record the file offset of the block about to be read so the
            #index can seek straight back to it later
            pos = maf_reader.file.tell()
            block = maf_reader.next()
            if block is None:
                break
            blocks += 1
            for c in block.components:
                spec = c.src
                chrom = None
                if "." in spec:
                    #src is "species.chromosome"; track both parts
                    spec, chrom = spec.split( ".", 1 )
                if spec not in species:
                    species.append( spec )
                    species_chromosomes[spec] = []
                if chrom and chrom not in species_chromosomes[spec]:
                    species_chromosomes[spec].append( chrom )
                if index_species is None or spec in index_species:
                    forward_strand_start = c.forward_strand_start
                    forward_strand_end = c.forward_strand_end
                    try:
                        forward_strand_start = int( forward_strand_start )
                        forward_strand_end = int( forward_strand_end )
                    except ValueError:
                        continue #start and end are not integers, can't add component to index, goto next component
                        #this likely only occurs when parse_e_rows is True?
                        #could a species exist as only e rows? should the
                    if forward_strand_end > forward_strand_start:
                        #require positive length; i.e. certain lines have start = end = 0 and cannot be indexed
                        indexes.add( c.src, forward_strand_start, forward_strand_end, pos, max=c.src_size )
    except Exception, e:
        #most likely a bad MAF; report empty results rather than propagate
        log.debug( 'Building MAF index on %s failed: %s' % ( filename, e ) )
        return ( None, [], {}, 0 )
    return ( indexes, species, species_chromosomes, blocks )
| 231 | |
#builds and returns ( index, index_filename ) for specified maf_file
def build_maf_index( maf_file, species = None ):
    #scan the MAF, then persist the interval index to a temp file
    indexes, found_species, species_chromosomes, blocks = build_maf_index_species_chromosomes( maf_file, species )
    if indexes is None:
        return ( None, None )
    fd, index_filename = tempfile.mkstemp()
    out = os.fdopen( fd, 'w' )
    indexes.write( out )
    out.close()
    index = bx.align.maf.Indexed( maf_file, index_filename = index_filename, keep_open = True, parse_e_rows = False )
    return ( index, index_filename )
| 242 | |
def component_overlaps_region( c, region ):
    #a None component (e.g. species absent from a block) never overlaps
    if c is None:
        return False
    start = c.get_forward_strand_start()
    end = c.get_forward_strand_end()
    #half-open interval intersection test
    return region.start < end and region.end > start
| 249 | |
def chop_block_by_region( block, src, region, species = None, mincols = 0 ):
    """Slice block down to the alignment columns overlapping region on src.

    Returns the chopped block (optionally limited to species, with all-gap
    columns removed), or None when no slice longer than mincols overlaps.
    """
    # This chopping method was designed to maintain consistency with how start/end padding gaps have been working in Galaxy thus far:
    # behavior as seen when forcing blocks to be '+' relative to src sequence (ref) and using block.slice_by_component( ref, slice_start, slice_end )
    # whether-or-not this is the 'correct' behavior is questionable, but this will at least maintain consistency
    # comments welcome
    slice_start = block.text_size #max for the min()
    slice_end = 0 #min for the max()
    old_score = block.score #save old score for later use
    # We no longer assume only one occurance of src per block, so we need to check them all
    for c in iter_components_by_src( block, src ):
        if component_overlaps_region( c, region ):
            if c.text is not None:
                rev_strand = False
                if c.strand == "-":
                    #We want our coord_to_col coordinates to be returned from positive stranded component
                    rev_strand = True
                    c = c.reverse_complement()
                #clamp the overlap to the component, then convert genomic
                #coordinates to alignment-column coordinates
                start = max( region.start, c.start )
                end = min( region.end, c.end )
                start = c.coord_to_col( start )
                end = c.coord_to_col( end )
                if rev_strand:
                    #need to orient slice coordinates to the original block direction
                    slice_len = end - start
                    end = len( c.text ) - start
                    start = end - slice_len
                #grow the slice to cover every overlapping occurrence of src
                slice_start = min( start, slice_start )
                slice_end = max( end, slice_end )

    if slice_start < slice_end:
        block = block.slice( slice_start, slice_end )
        if block.text_size > mincols:
            # restore old score, may not be accurate, but it is better than 0 for everything?
            block.score = old_score
            if species is not None:
                block = block.limit_to_species( species )
            block.remove_all_gap_columns()
            return block
    return None
| 289 | |
def orient_block_by_region( block, src, region, force_strand = None ):
    """Return block, reverse-complemented when no src component overlapping
    region lies on the desired strand (force_strand when given, otherwise
    region.strand)."""
    #loop through components matching src,
    #make sure each of these components overlap region
    #cache strand for each of overlaping regions
    #if force_strand / region.strand not in strand cache, reverse complement
    ### we could have 2 sequences with same src, overlapping region, on different strands, this would cause no reverse_complementing
    strands = [ c.strand for c in iter_components_by_src( block, src ) if component_overlaps_region( c, region ) ]
    #NOTE(review): 'and' binds tighter than 'or', so this condition parses as
    #( strands and A ) or B; with an empty strands list and a non-None
    #force_strand, the B clause is still True and the block gets flipped —
    #confirm whether that is intended
    if strands and ( force_strand is None and region.strand not in strands ) or ( force_strand is not None and force_strand not in strands ):
        block = block.reverse_complement()
    return block
| 300 | |
def get_oriented_chopped_blocks_for_region( index, src, region, species = None, mincols = 0, force_strand = None ):
    #same as the index+offset variant, yielding only the block itself
    for chopped, _idx, _offset in get_oriented_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols, force_strand ):
        yield chopped
def get_oriented_chopped_blocks_with_index_offset_for_region( index, src, region, species = None, mincols = 0, force_strand = None ):
    #chop each overlapping block, then orient it to the requested strand
    for chopped, idx, offset in get_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols ):
        oriented = orient_block_by_region( chopped, src, region, force_strand )
        yield oriented, idx, offset
| 307 | |
#split a block with multiple occurances of src into one block per src
def iter_blocks_split_by_src( block, src ):
    #emit one copy of the block per matching component; each copy carries
    #every component of other srcs plus exactly one matching component
    for chosen in iter_components_by_src( block, src ):
        split_block = bx.align.Alignment( score=block.score, attributes=deepcopy( block.attributes ) )
        split_block.text_size = block.text_size
        for component in block.components:
            if component == chosen or component.src != src:
                #deepcopy so the originals keep their reference to the
                #original alignment block
                split_block.add_component( deepcopy( component ) )
        yield split_block
| 317 | |
#split a block into multiple blocks with all combinations of a species appearing only once per block
def iter_blocks_split_by_species( block, species = None ):
    """Yield one new block per combination of components such that every
    considered species contributes exactly one component per yielded block.

    When species is given, only those species are considered (components
    matched by src prefix); otherwise all species found in the block.
    """
    def __split_components_by_species( components_by_species, new_block ):
        #recursively choose one component per species, yielding a finished
        #block once every species has contributed
        if components_by_species:
            #more species with components to add to this block
            components_by_species = deepcopy( components_by_species )
            spec_comps = components_by_species.pop( 0 )
            for c in spec_comps:
                newer_block = deepcopy( new_block )
                newer_block.add_component( deepcopy( c ) )
                for value in __split_components_by_species( components_by_species, newer_block ):
                    yield value
        else:
            #no more components to add, yield this block
            yield new_block

    #divide components by species
    spec_dict = {}
    if not species:
        species = []
        for c in block.components:
            spec, chrom = src_split( c.src )
            if spec not in spec_dict:
                spec_dict[ spec ] = []
                species.append( spec )
            spec_dict[ spec ].append( c )
    else:
        for spec in species:
            spec_dict[ spec ] = []
            for c in iter_components_by_src_start( block, spec ):
                spec_dict[ spec ].append( c )

    empty_block = bx.align.Alignment( score=block.score, attributes=deepcopy( block.attributes ) ) #should we copy attributes?
    empty_block.text_size = block.text_size
    #call recursive function to split into each combo of spec/blocks
    for value in __split_components_by_species( spec_dict.values(), empty_block ):
        sort_block_components_by_block( value, block ) #restore original component order
        yield value
| 356 | |
| 357 | |
#generator yielding only chopped and valid blocks for a specified region
def get_chopped_blocks_for_region( index, src, region, species = None, mincols = 0 ):
    #delegate to the index+offset variant, discarding index and offset
    for chopped, _idx, _offset in get_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols ):
        yield chopped
def get_chopped_blocks_with_index_offset_for_region( index, src, region, species = None, mincols = 0 ):
    #chop each overlapping block to the region; skip blocks that vanish
    for raw_block, idx, offset in index.get_as_iterator_with_index_and_offset( src, region.start, region.end ):
        chopped = chop_block_by_region( raw_block, src, region, species, mincols )
        if chopped is None:
            continue
        yield chopped, idx, offset
| 367 | |
#returns a filled region alignment for specified regions
def get_region_alignment( index, primary_species, chrom, start, end, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ):
    #seed the empty alignment with the requested species list, or just the
    #primary species when none was given, then delegate the filling
    if species is None:
        alignment = RegionAlignment( end - start, primary_species )
    else:
        alignment = RegionAlignment( end - start, species )
    return fill_region_alignment( alignment, index, primary_species, chrom, start, end, strand, species, mincols, overwrite_with_gaps )
| 373 | |
#reduces a block to only positions exisiting in the src provided
def reduce_block_by_primary_genome( block, species, chromosome, region_start ):
    #returns ( start_offset, {species: text} ) where every column that is a
    #gap in the primary species has been dropped from all species' texts
    src = "%s.%s" % ( species, chromosome )
    ref = block.get_component_by_src( src )
    start_offset = ref.start - region_start
    texts = {}
    for comp in block.components:
        texts[ comp.src.split( '.' )[0] ] = comp.text
    #columns where the primary species has a real base (not a gap)
    primary = texts[ species ]
    keep = []
    for i in range( len( primary ) ):
        if primary[i] != '-':
            keep.append( i )
    reduced = {}
    for spec, text in texts.items():
        reduced[ spec ] = ''.join( [ text[i] for i in keep ] )
    return ( start_offset, reduced )
| 392 | |
#fills a region alignment
def fill_region_alignment( alignment, index, primary_species, chrom, start, end, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ):
    """Layer MAF blocks from index onto alignment over primary_species:chrom
    [start, end), applying lowest-scoring blocks first so that higher scores
    win conflicts.

    When overwrite_with_gaps is False, gap characters never overwrite
    previously written bases.  Returns the filled alignment.
    """
    region = bx.intervals.Interval( start, end )
    region.chrom = chrom
    region.strand = strand
    primary_src = "%s.%s" % ( primary_species, chrom )

    #Order blocks overlaping this position by score, lowest first
    blocks = []
    for block, idx, offset in index.get_as_iterator_with_index_and_offset( primary_src, start, end ):
        score = float( block.score )
        #insertion sort into ascending score order; keep (score, idx, offset)
        #so the block itself can be re-read lazily later
        for i in range( 0, len( blocks ) ):
            if score < blocks[i][0]:
                blocks.insert( i, ( score, idx, offset ) )
                break
        else:
            blocks.append( ( score, idx, offset ) )

    #gap_chars_tuple = tuple( GAP_CHARS )
    gap_chars_str = ''.join( GAP_CHARS )
    #Loop through ordered blocks and layer by increasing score
    for block_dict in blocks:
        #block_dict is ( score, index, offset ); re-read the block from disk
        for block in iter_blocks_split_by_species( block_dict[1].get_at_offset( block_dict[2] ) ): #need to handle each occurance of sequence in block seperately
            if component_overlaps_region( block.get_component_by_src( primary_src ), region ):
                block = chop_block_by_region( block, primary_src, region, species, mincols ) #chop block
                block = orient_block_by_region( block, primary_src, region ) #orient block
                start_offset, species_texts = reduce_block_by_primary_genome( block, primary_species, chrom, start )
                for spec, text in species_texts.items():
                    #we should trim gaps from both sides, since these are not positions in this species genome (sequence)
                    text = text.rstrip( gap_chars_str )
                    gap_offset = 0
                    while True in [ text.startswith( gap_char ) for gap_char in GAP_CHARS ]: #python2.4 doesn't accept a tuple for .startswith()
                        #while text.startswith( gap_chars_tuple ):
                        gap_offset += 1
                        text = text[1:]
                        if not text:
                            break
                    if text:
                        if overwrite_with_gaps:
                            alignment.set_range( start_offset + gap_offset, spec, text )
                        else:
                            #write only real bases, preserving whatever was
                            #previously layered under this block's gaps
                            for i, char in enumerate( text ):
                                if char not in GAP_CHARS:
                                    alignment.set_position( start_offset + gap_offset + i, spec, char )
    return alignment
| 438 | |
#returns a filled spliced region alignment for specified region with start and end lists
def get_spliced_region_alignment( index, primary_species, chrom, starts, ends, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ):
    #create spliced alignment object, then fill each exon independently
    if species is None:
        alignment = SplicedAlignment( starts, ends, [primary_species] )
    else:
        alignment = SplicedAlignment( starts, ends, species )
    for exon in alignment.exons:
        fill_region_alignment( exon, index, primary_species, chrom, exon.start, exon.end, strand, species, mincols, overwrite_with_gaps )
    return alignment
| 447 | |
#loop through string array, only return non-commented lines
def line_enumerator( lines, comment_start = '#' ):
    #yields ( count, line ) with count 1-based over non-comment lines only;
    #comment lines are skipped entirely and do not advance the count
    count = 0
    for line in lines:
        if line.startswith( comment_start ):
            continue
        count += 1
        yield ( count, line )
| 455 | |
#read a GeneBed file, return list of starts, ends, raw fields
def get_starts_ends_fields_from_gene_bed( line ):
    """Parse one 12+ column (gene) BED line.

    Returns ( starts, ends, fields ): the coding-exon start/end coordinate
    lists clipped to the CDS (exons entirely outside the CDS are dropped)
    and the raw whitespace-split fields.  Raises Exception when the line
    has fewer than 12 columns.
    """
    starts = []
    ends = []

    fields = line.split()
    #Requires atleast 12 BED columns
    if len( fields ) < 12:
        raise Exception( "Not a proper 12 column BED line (%s)." % line )
    chrom = fields[0]
    tx_start = int( fields[1] )
    tx_end = int( fields[2] )
    name = fields[3]
    strand = fields[5]
    if strand != '-':
        strand = '+' #Default strand is +
    cds_start = int( fields[6] )
    cds_end = int( fields[7] )

    #Calculate and store starts and ends of coding exons
    #block starts (field 11) are relative to tx_start; block sizes (field 10)
    #give each exon's length
    exon_starts = [ tx_start + int( value ) for value in fields[11].rstrip( ',\n' ).split( ',' ) ]
    exon_sizes = [ int( value ) for value in fields[10].rstrip( ',' ).split( ',' ) ]
    for exon_start, exon_size in zip( exon_starts, exon_sizes ):
        clipped_start = max( exon_start, cds_start )
        clipped_end = min( exon_start + exon_size, cds_end )
        if clipped_start < clipped_end:
            starts.append( clipped_start )
            ends.append( clipped_end )
    return ( starts, ends, fields )
| 488 | |
def iter_components_by_src( block, src ):
    #yield every component whose src matches exactly
    for component in block.components:
        if component.src == src:
            yield component
| 493 | |
def get_components_by_src( block, src ):
    #materialized form of iter_components_by_src
    return list( iter_components_by_src( block, src ) )
| 496 | |
def iter_components_by_src_start( block, src ):
    #yield every component whose src begins with the given prefix
    for component in block.components:
        if component.src.startswith( src ):
            yield component
| 501 | |
def get_components_by_src_start( block, src ):
    #materialized form of iter_components_by_src_start
    return list( iter_components_by_src_start( block, src ) )
| 504 | |
def sort_block_components_by_block( block1, block2 ):
    #orders the components in block1 by the index of the component in block2
    #block1 must be a subset of block2
    #sorts in-place and returns None
    block1.components.sort( key = lambda c: block2.components.index( c ) )
| 510 | |
def get_species_in_maf( maf_filename ):
    #collect the unique species names across all blocks of a MAF file,
    #in order of first appearance
    seen = []
    for block in bx.align.maf.Reader( open( maf_filename ) ):
        for spec in get_species_in_block( block ):
            if spec not in seen:
                seen.append( spec )
    return seen
| 518 | |
def parse_species_option( species ):
    #parse a comma-separated species list; empty/None input or a literal
    #'None' entry means "all species", signaled by returning None
    if not species:
        return None
    requested = species.split( ',' )
    if 'None' in requested:
        return None
    return requested
| 525 | |
def remove_temp_index_file( index_filename ):
    #best-effort removal of a temporary index file; all failures
    #(missing file, permissions, ...) are deliberately ignored
    try:
        os.unlink( index_filename )
    except:
        pass
| 529 | |
| 530 #Below are methods to deal with FASTA files | |
| 531 | |
def get_fasta_header( component, attributes = None, suffix = None ):
    """Build a FASTA header line for an alignment component:
    ">src(strand):start-end|key=value|...|suffix".

    When suffix is falsy, the component's species name (the part of src
    before the first '.') is used as the suffix.
    """
    header = ">%s(%s):%i-%i|" % ( component.src, component.strand, component.get_forward_strand_start(), component.get_forward_strand_end() )
    #None default instead of a mutable {} default, which would be shared
    #across calls
    if attributes is None:
        attributes = {}
    for key, value in attributes.items():
        header = "%s%s=%s|" % ( header, key, value )
    if suffix:
        header = "%s%s" % ( header, suffix )
    else:
        header = "%s%s" % ( header, src_split( component.src )[ 0 ] )
    return header
| 541 | |
def get_attributes_from_fasta_header( header ):
    """Parse a FASTA header produced by get_fasta_header back into a dict.

    Recognized keys: species, chrom, strand, start, end, any embedded
    key=value pairs, and '__suffix__' for the trailing field.  Returns {}
    for an empty header; the positional keys are simply omitted when the
    first field is not a region coordinate.
    """
    if not header:
        return {}
    attributes = {}
    fields = header.lstrip( '>' ).strip().split( '|' )
    try:
        coords = fields[0].split( '(', 1 )
        spec_chrom = coords[0].split( '.', 1 )
        attributes['species'] = spec_chrom[0]
        if len( spec_chrom ) == 2:
            attributes['chrom'] = spec_chrom[1]
        else:
            attributes['chrom'] = spec_chrom[0]
        rest = coords[1].split( ')', 1 )
        attributes['strand'] = rest[0]
        span = rest[1].lstrip( ':' ).split( '-' )
        attributes['start'] = int( span[0] )
        attributes['end'] = int( span[1] )
    except:
        #fields[0] is not a region coordinate; leave positional keys unset
        pass
    if len( fields ) > 2:
        for i in range( 1, len( fields ) - 1 ):
            prop = fields[i].split( '=', 1 )
            if len( prop ) == 2:
                attributes[ prop[0] ] = prop[1]
    if len( fields ) > 1:
        attributes['__suffix__'] = fields[-1]
    return attributes
| 573 | |
def iter_fasta_alignment( filename ):
    #yields one list of fastaComponent objects per alignment "paragraph"
    #(groups of records separated by blank lines) in a FASTA file
    class fastaComponent:
        def __init__( self, species, text = "" ):
            self.species = species
            self.text = text
        def extend( self, text ):
            self.text = self.text + text.replace( '\n', '' ).replace( '\r', '' ).strip()
    fh = open( filename, 'rb' )
    batch = []
    while True:
        raw = fh.readline()
        if not raw:
            #end of file: flush any pending paragraph
            break
        stripped = raw.strip()
        if not stripped:
            #blank line ends the current paragraph
            if batch:
                yield batch
                batch = []
        elif stripped.startswith( '>' ):
            attrs = get_attributes_from_fasta_header( stripped )
            batch.append( fastaComponent( attrs['species'] ) )
        elif batch:
            #sequence line: append to the most recent component
            batch[-1].extend( stripped )
    if batch:
        yield batch
