| 0 | 1 #!/usr/bin/env python | 
|  | 2 """ | 
|  | 3 Provides wrappers and utilities for working with MAF files and alignments. | 
|  | 4 """ | 
|  | 5 #Dan Blankenberg | 
|  | 6 import bx.align.maf | 
|  | 7 import bx.intervals | 
|  | 8 import bx.interval_index_file | 
|  | 9 import sys, os, string, tempfile | 
|  | 10 import logging | 
|  | 11 from copy import deepcopy | 
|  | 12 | 
|  | 13 assert sys.version_info[:2] >= ( 2, 4 ) | 
|  | 14 | 
|  | 15 log = logging.getLogger(__name__) | 
|  | 16 | 
|  | 17 | 
|  | 18 GAP_CHARS = [ '-' ] | 
|  | 19 SRC_SPLIT_CHAR = '.' | 
|  | 20 | 
|  | 21 def src_split( src ): | 
|  | 22     fields = src.split( SRC_SPLIT_CHAR, 1 ) | 
|  | 23     spec = fields.pop( 0 ) | 
|  | 24     if fields: | 
|  | 25         chrom = fields.pop( 0 ) | 
|  | 26     else: | 
|  | 27         chrom = spec | 
|  | 28     return spec, chrom | 
|  | 29 | 
|  | 30 def src_merge( spec, chrom, contig = None ): | 
|  | 31     if None in [ spec, chrom ]: | 
|  | 32         spec = chrom = spec or chrom | 
|  | 33     return bx.align.maf.src_merge( spec, chrom, contig ) | 
|  | 34 | 
|  | 35 def get_species_in_block( block ): | 
|  | 36     species = [] | 
|  | 37     for c in block.components: | 
|  | 38         spec, chrom = src_split( c.src ) | 
|  | 39         if spec not in species: | 
|  | 40             species.append( spec ) | 
|  | 41     return species | 
|  | 42 | 
|  | 43 def tool_fail( msg = "Unknown Error" ): | 
|  | 44     print >> sys.stderr, "Fatal Error: %s" % msg | 
|  | 45     sys.exit() | 
|  | 46 | 
|  | 47 #an object corresponding to a reference layered alignment | 
|  | 48 class RegionAlignment( object ): | 
|  | 49 | 
|  | 50     DNA_COMPLEMENT = string.maketrans( "ACGTacgt", "TGCAtgca" ) | 
|  | 51     MAX_SEQUENCE_SIZE = sys.maxint #Maximum length of sequence allowed | 
|  | 52 | 
|  | 53     def __init__( self, size, species = [] ): | 
|  | 54         assert size <= self.MAX_SEQUENCE_SIZE, "Maximum length allowed for an individual sequence has been exceeded (%i > %i)." % ( size, self.MAX_SEQUENCE_SIZE ) | 
|  | 55         self.size = size | 
|  | 56         self.sequences = {} | 
|  | 57         if not isinstance( species, list ): | 
|  | 58             species = [species] | 
|  | 59         for spec in species: | 
|  | 60             self.add_species( spec ) | 
|  | 61 | 
|  | 62     #add a species to the alignment | 
|  | 63     def add_species( self, species ): | 
|  | 64         #make temporary sequence files | 
|  | 65         self.sequences[species] = tempfile.TemporaryFile() | 
|  | 66         self.sequences[species].write( "-" * self.size ) | 
|  | 67 | 
|  | 68     #returns the names for species found in alignment, skipping names as requested | 
|  | 69     def get_species_names( self, skip = [] ): | 
|  | 70         if not isinstance( skip, list ): skip = [skip] | 
|  | 71         names = self.sequences.keys() | 
|  | 72         for name in skip: | 
|  | 73             try: names.remove( name ) | 
|  | 74             except: pass | 
|  | 75         return names | 
|  | 76 | 
|  | 77     #returns the sequence for a species | 
|  | 78     def get_sequence( self, species ): | 
|  | 79         self.sequences[species].seek( 0 ) | 
|  | 80         return self.sequences[species].read() | 
|  | 81 | 
|  | 82     #returns the reverse complement of the sequence for a species | 
|  | 83     def get_sequence_reverse_complement( self, species ): | 
|  | 84         complement = [base for base in self.get_sequence( species ).translate( self.DNA_COMPLEMENT )] | 
|  | 85         complement.reverse() | 
|  | 86         return "".join( complement ) | 
|  | 87 | 
|  | 88     #sets a position for a species | 
|  | 89     def set_position( self, index, species, base ): | 
|  | 90         if len( base ) != 1: raise Exception( "A genomic position can only have a length of 1." ) | 
|  | 91         return self.set_range( index, species, base ) | 
|  | 92     #sets a range for a species | 
|  | 93     def set_range( self, index, species, bases ): | 
|  | 94         if index >= self.size or index < 0: raise Exception( "Your index (%i) is out of range (0 - %i)." % ( index, self.size - 1 ) ) | 
|  | 95         if len( bases ) == 0: raise Exception( "A set of genomic positions can only have a positive length." ) | 
|  | 96         if species not in self.sequences.keys(): self.add_species( species ) | 
|  | 97         self.sequences[species].seek( index ) | 
|  | 98         self.sequences[species].write( bases ) | 
|  | 99 | 
|  | 100     #Flush temp file of specified species, or all species | 
|  | 101     def flush( self, species = None ): | 
|  | 102         if species is None: | 
|  | 103             species = self.sequences.keys() | 
|  | 104         elif not isinstance( species, list ): | 
|  | 105             species = [species] | 
|  | 106         for spec in species: | 
|  | 107             self.sequences[spec].flush() | 
|  | 108 | 
|  | 109 class GenomicRegionAlignment( RegionAlignment ): | 
|  | 110 | 
|  | 111     def __init__( self, start, end, species = [] ): | 
|  | 112         RegionAlignment.__init__( self, end - start, species ) | 
|  | 113         self.start = start | 
|  | 114         self.end = end | 
|  | 115 | 
|  | 116 class SplicedAlignment( object ): | 
|  | 117 | 
|  | 118     DNA_COMPLEMENT = string.maketrans( "ACGTacgt", "TGCAtgca" ) | 
|  | 119 | 
|  | 120     def __init__( self, exon_starts, exon_ends, species = [] ): | 
|  | 121         if not isinstance( exon_starts, list ): | 
|  | 122             exon_starts = [exon_starts] | 
|  | 123         if not isinstance( exon_ends, list ): | 
|  | 124             exon_ends = [exon_ends] | 
|  | 125         assert len( exon_starts ) == len( exon_ends ), "The number of starts does not match the number of sizes." | 
|  | 126         self.exons = [] | 
|  | 127         for i in range( len( exon_starts ) ): | 
|  | 128             self.exons.append( GenomicRegionAlignment( exon_starts[i], exon_ends[i], species ) ) | 
|  | 129 | 
|  | 130     #returns the names for species found in alignment, skipping names as requested | 
|  | 131     def get_species_names( self, skip = [] ): | 
|  | 132         if not isinstance( skip, list ): skip = [skip] | 
|  | 133         names = [] | 
|  | 134         for exon in self.exons: | 
|  | 135             for name in exon.get_species_names( skip = skip ): | 
|  | 136                 if name not in names: | 
|  | 137                     names.append( name ) | 
|  | 138         return names | 
|  | 139 | 
|  | 140     #returns the sequence for a species | 
|  | 141     def get_sequence( self, species ): | 
|  | 142         sequence = tempfile.TemporaryFile() | 
|  | 143         for exon in self.exons: | 
|  | 144             if species in exon.get_species_names(): | 
|  | 145                 sequence.write( exon.get_sequence( species ) ) | 
|  | 146             else: | 
|  | 147                 sequence.write( "-" * exon.size ) | 
|  | 148         sequence.seek( 0 ) | 
|  | 149         return sequence.read() | 
|  | 150 | 
|  | 151     #returns the reverse complement of the sequence for a species | 
|  | 152     def get_sequence_reverse_complement( self, species ): | 
|  | 153         complement = [base for base in self.get_sequence( species ).translate( self.DNA_COMPLEMENT )] | 
|  | 154         complement.reverse() | 
|  | 155         return "".join( complement ) | 
|  | 156 | 
|  | 157     #Start and end of coding region | 
|  | 158     @property | 
|  | 159     def start( self ): | 
|  | 160         return self.exons[0].start | 
|  | 161     @property | 
|  | 162     def end( self ): | 
|  | 163         return self.exons[-1].end | 
|  | 164 | 
|  | 165 #Open a MAF index using a UID | 
|  | 166 def maf_index_by_uid( maf_uid, index_location_file ): | 
|  | 167     for line in open( index_location_file ): | 
|  | 168         try: | 
|  | 169             #read each line, if not enough fields, go to next line | 
|  | 170             if line[0:1] == "#" : continue | 
|  | 171             fields = line.split('\t') | 
|  | 172             if maf_uid == fields[1]: | 
|  | 173                 try: | 
|  | 174                     maf_files = fields[4].replace( "\n", "" ).replace( "\r", "" ).split( "," ) | 
|  | 175                     return bx.align.maf.MultiIndexed( maf_files, keep_open = True, parse_e_rows = False ) | 
|  | 176                 except Exception, e: | 
|  | 177                     raise Exception( 'MAF UID (%s) found, but configuration appears to be malformed: %s' % ( maf_uid, e ) ) | 
|  | 178         except: | 
|  | 179             pass | 
|  | 180     return None | 
|  | 181 | 
|  | 182 #return ( index, temp_index_filename ) for user maf, if available, or build one and return it, return None when no tempfile is created | 
|  | 183 def open_or_build_maf_index( maf_file, index_filename, species = None ): | 
|  | 184     try: | 
|  | 185         return ( bx.align.maf.Indexed( maf_file, index_filename = index_filename, keep_open = True, parse_e_rows = False ), None ) | 
|  | 186     except: | 
|  | 187         return build_maf_index( maf_file, species = species ) | 
|  | 188 | 
|  | 189 def build_maf_index_species_chromosomes( filename, index_species = None ): | 
|  | 190     species = [] | 
|  | 191     species_chromosomes = {} | 
|  | 192     indexes = bx.interval_index_file.Indexes() | 
|  | 193     blocks = 0 | 
|  | 194     try: | 
|  | 195         maf_reader = bx.align.maf.Reader( open( filename ) ) | 
|  | 196         while True: | 
|  | 197             pos = maf_reader.file.tell() | 
|  | 198             block = maf_reader.next() | 
|  | 199             if block is None: | 
|  | 200                 break | 
|  | 201             blocks += 1 | 
|  | 202             for c in block.components: | 
|  | 203                 spec = c.src | 
|  | 204                 chrom = None | 
|  | 205                 if "." in spec: | 
|  | 206                     spec, chrom = spec.split( ".", 1 ) | 
|  | 207                 if spec not in species: | 
|  | 208                     species.append( spec ) | 
|  | 209                     species_chromosomes[spec] = [] | 
|  | 210                 if chrom and chrom not in species_chromosomes[spec]: | 
|  | 211                     species_chromosomes[spec].append( chrom ) | 
|  | 212                 if index_species is None or spec in index_species: | 
|  | 213                     forward_strand_start = c.forward_strand_start | 
|  | 214                     forward_strand_end = c.forward_strand_end | 
|  | 215                     try: | 
|  | 216                         forward_strand_start = int( forward_strand_start ) | 
|  | 217                         forward_strand_end = int( forward_strand_end ) | 
|  | 218                     except ValueError: | 
|  | 219                         continue #start and end are not integers, can't add component to index, goto next component | 
|  | 220                         #this likely only occurs when parse_e_rows is True? | 
|  | 221                         #could a species exist as only e rows? should the | 
|  | 222                     if forward_strand_end > forward_strand_start: | 
|  | 223                         #require positive length; i.e. certain lines have start = end = 0 and cannot be indexed | 
|  | 224                         indexes.add( c.src, forward_strand_start, forward_strand_end, pos, max=c.src_size ) | 
|  | 225     except Exception, e: | 
|  | 226         #most likely a bad MAF | 
|  | 227         log.debug( 'Building MAF index on %s failed: %s' % ( filename, e ) ) | 
|  | 228         return ( None, [], {}, 0 ) | 
|  | 229     return ( indexes, species, species_chromosomes, blocks ) | 
|  | 230 | 
|  | 231 #builds and returns ( index, index_filename ) for specified maf_file | 
|  | 232 def build_maf_index( maf_file, species = None ): | 
|  | 233     indexes, found_species, species_chromosomes, blocks = build_maf_index_species_chromosomes( maf_file, species ) | 
|  | 234     if indexes is not None: | 
|  | 235         fd, index_filename = tempfile.mkstemp() | 
|  | 236         out = os.fdopen( fd, 'w' ) | 
|  | 237         indexes.write( out ) | 
|  | 238         out.close() | 
|  | 239         return ( bx.align.maf.Indexed( maf_file, index_filename = index_filename, keep_open = True, parse_e_rows = False ), index_filename ) | 
|  | 240     return ( None, None ) | 
|  | 241 | 
|  | 242 def component_overlaps_region( c, region ): | 
|  | 243     if c is None: return False | 
|  | 244     start, end = c.get_forward_strand_start(), c.get_forward_strand_end() | 
|  | 245     if region.start >= end or region.end <= start: | 
|  | 246         return False | 
|  | 247     return True | 
|  | 248 | 
|  | 249 def chop_block_by_region( block, src, region, species = None, mincols = 0 ): | 
|  | 250     # This chopping method was designed to maintain consistency with how start/end padding gaps have been working in Galaxy thus far: | 
|  | 251     #   behavior as seen when forcing blocks to be '+' relative to src sequence (ref) and using block.slice_by_component( ref, slice_start, slice_end ) | 
|  | 252     #   whether-or-not this is the 'correct' behavior is questionable, but this will at least maintain consistency | 
|  | 253     # comments welcome | 
|  | 254     slice_start = block.text_size #max for the min() | 
|  | 255     slice_end = 0 #min for the max() | 
|  | 256     old_score = block.score #save old score for later use | 
|  | 257     # We no longer assume only one occurance of src per block, so we need to check them all | 
|  | 258     for c in iter_components_by_src( block, src ): | 
|  | 259         if component_overlaps_region( c, region ): | 
|  | 260             if c.text is not None: | 
|  | 261                 rev_strand = False | 
|  | 262                 if c.strand == "-": | 
|  | 263                     #We want our coord_to_col coordinates to be returned from positive stranded component | 
|  | 264                     rev_strand = True | 
|  | 265                     c = c.reverse_complement() | 
|  | 266                 start = max( region.start, c.start ) | 
|  | 267                 end = min( region.end, c.end ) | 
|  | 268                 start = c.coord_to_col( start ) | 
|  | 269                 end = c.coord_to_col( end ) | 
|  | 270                 if rev_strand: | 
|  | 271                     #need to orient slice coordinates to the original block direction | 
|  | 272                     slice_len = end - start | 
|  | 273                     end = len( c.text ) - start | 
|  | 274                     start = end - slice_len | 
|  | 275                 slice_start = min( start, slice_start ) | 
|  | 276                 slice_end = max( end, slice_end ) | 
|  | 277 | 
|  | 278     if slice_start < slice_end: | 
|  | 279         block = block.slice( slice_start, slice_end ) | 
|  | 280         if block.text_size > mincols: | 
|  | 281             # restore old score, may not be accurate, but it is better than 0 for everything? | 
|  | 282             block.score = old_score | 
|  | 283             if species is not None: | 
|  | 284                 block = block.limit_to_species( species ) | 
|  | 285                 block.remove_all_gap_columns() | 
|  | 286             return block | 
|  | 287     return None | 
|  | 288 | 
|  | 289 def orient_block_by_region( block, src, region, force_strand = None ): | 
|  | 290     #loop through components matching src, | 
|  | 291     #make sure each of these components overlap region | 
|  | 292     #cache strand for each of overlaping regions | 
|  | 293     #if force_strand / region.strand not in strand cache, reverse complement | 
|  | 294     ### we could have 2 sequences with same src, overlapping region, on different strands, this would cause no reverse_complementing | 
|  | 295     strands = [ c.strand for c in iter_components_by_src( block, src ) if component_overlaps_region( c, region ) ] | 
|  | 296     if strands and ( force_strand is None and region.strand not in strands ) or ( force_strand is not None and force_strand not in strands ): | 
|  | 297         block = block.reverse_complement() | 
|  | 298     return block | 
|  | 299 | 
|  | 300 def get_oriented_chopped_blocks_for_region( index, src, region, species = None, mincols = 0, force_strand = None ): | 
|  | 301     for block, idx, offset in get_oriented_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols, force_strand ): | 
|  | 302         yield block | 
|  | 303 def get_oriented_chopped_blocks_with_index_offset_for_region( index, src, region, species = None, mincols = 0, force_strand = None ): | 
|  | 304     for block, idx, offset in get_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols ): | 
|  | 305         yield orient_block_by_region( block, src, region, force_strand ), idx, offset | 
|  | 306 | 
|  | 307 #split a block with multiple occurances of src into one block per src | 
|  | 308 def iter_blocks_split_by_src( block, src ): | 
|  | 309     for src_c in iter_components_by_src( block, src ): | 
|  | 310         new_block = bx.align.Alignment( score=block.score, attributes=deepcopy( block.attributes ) ) | 
|  | 311         new_block.text_size = block.text_size | 
|  | 312         for c in block.components: | 
|  | 313             if c == src_c or c.src != src: | 
|  | 314                 new_block.add_component( deepcopy( c ) ) #components have reference to alignment, dont want to loose reference to original alignment block in original components | 
|  | 315         yield new_block | 
|  | 316 | 
|  | 317 #split a block into multiple blocks with all combinations of a species appearing only once per block | 
|  | 318 def iter_blocks_split_by_species( block, species = None ): | 
|  | 319     def __split_components_by_species( components_by_species, new_block ): | 
|  | 320         if components_by_species: | 
|  | 321             #more species with components to add to this block | 
|  | 322             components_by_species = deepcopy( components_by_species ) | 
|  | 323             spec_comps = components_by_species.pop( 0 ) | 
|  | 324             for c in spec_comps: | 
|  | 325                 newer_block = deepcopy( new_block ) | 
|  | 326                 newer_block.add_component( deepcopy( c ) ) | 
|  | 327                 for value in __split_components_by_species( components_by_species, newer_block ): | 
|  | 328                     yield value | 
|  | 329         else: | 
|  | 330             #no more components to add, yield this block | 
|  | 331             yield new_block | 
|  | 332 | 
|  | 333     #divide components by species | 
|  | 334     spec_dict = {} | 
|  | 335     if not species: | 
|  | 336         species = [] | 
|  | 337         for c in block.components: | 
|  | 338             spec, chrom = src_split( c.src ) | 
|  | 339             if spec not in spec_dict: | 
|  | 340                 spec_dict[ spec ] = [] | 
|  | 341                 species.append( spec ) | 
|  | 342             spec_dict[ spec ].append( c ) | 
|  | 343     else: | 
|  | 344         for spec in species: | 
|  | 345             spec_dict[ spec ] = [] | 
|  | 346             for c in iter_components_by_src_start( block, spec ): | 
|  | 347                 spec_dict[ spec ].append( c ) | 
|  | 348 | 
|  | 349     empty_block = bx.align.Alignment( score=block.score, attributes=deepcopy( block.attributes ) ) #should we copy attributes? | 
|  | 350     empty_block.text_size = block.text_size | 
|  | 351     #call recursive function to split into each combo of spec/blocks | 
|  | 352     for value in __split_components_by_species( spec_dict.values(), empty_block ): | 
|  | 353         sort_block_components_by_block( value, block ) #restore original component order | 
|  | 354         yield value | 
|  | 355 | 
|  | 356 | 
|  | 357 #generator yielding only chopped and valid blocks for a specified region | 
|  | 358 def get_chopped_blocks_for_region( index, src, region, species = None, mincols = 0 ): | 
|  | 359     for block, idx, offset in get_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols ): | 
|  | 360         yield block | 
|  | 361 def get_chopped_blocks_with_index_offset_for_region( index, src, region, species = None, mincols = 0 ): | 
|  | 362     for block, idx, offset in index.get_as_iterator_with_index_and_offset( src, region.start, region.end ): | 
|  | 363         block = chop_block_by_region( block, src, region, species, mincols ) | 
|  | 364         if block is not None: | 
|  | 365             yield block, idx, offset | 
|  | 366 | 
|  | 367 #returns a filled region alignment for specified regions | 
|  | 368 def get_region_alignment( index, primary_species, chrom, start, end, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ): | 
|  | 369     if species is not None: alignment = RegionAlignment( end - start, species ) | 
|  | 370     else: alignment = RegionAlignment( end - start, primary_species ) | 
|  | 371     return fill_region_alignment( alignment, index, primary_species, chrom, start, end, strand, species, mincols, overwrite_with_gaps ) | 
|  | 372 | 
|  | 373 #reduces a block to only positions exisiting in the src provided | 
|  | 374 def reduce_block_by_primary_genome( block, species, chromosome, region_start ): | 
|  | 375     #returns ( startIndex, {species:texts} | 
|  | 376     #where texts' contents are reduced to only positions existing in the primary genome | 
|  | 377     src = "%s.%s" % ( species, chromosome ) | 
|  | 378     ref = block.get_component_by_src( src ) | 
|  | 379     start_offset = ref.start - region_start | 
|  | 380     species_texts = {} | 
|  | 381     for c in block.components: | 
|  | 382         species_texts[ c.src.split( '.' )[0] ] = list( c.text ) | 
|  | 383     #remove locations which are gaps in the primary species, starting from the downstream end | 
|  | 384     for i in range( len( species_texts[ species ] ) - 1, -1, -1 ): | 
|  | 385         if species_texts[ species ][i] == '-': | 
|  | 386             for text in species_texts.values(): | 
|  | 387                 text.pop( i ) | 
|  | 388     for spec, text in species_texts.items(): | 
|  | 389         species_texts[spec] = ''.join( text ) | 
|  | 390     return ( start_offset, species_texts ) | 
|  | 391 | 
|  | 392 #fills a region alignment | 
|  | 393 def fill_region_alignment( alignment, index, primary_species, chrom, start, end, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ): | 
|  | 394     region = bx.intervals.Interval( start, end ) | 
|  | 395     region.chrom = chrom | 
|  | 396     region.strand = strand | 
|  | 397     primary_src = "%s.%s" % ( primary_species, chrom ) | 
|  | 398 | 
|  | 399     #Order blocks overlaping this position by score, lowest first | 
|  | 400     blocks = [] | 
|  | 401     for block, idx, offset in index.get_as_iterator_with_index_and_offset( primary_src, start, end ): | 
|  | 402         score = float( block.score ) | 
|  | 403         for i in range( 0, len( blocks ) ): | 
|  | 404             if score < blocks[i][0]: | 
|  | 405                 blocks.insert( i, ( score, idx, offset ) ) | 
|  | 406                 break | 
|  | 407         else: | 
|  | 408             blocks.append( ( score, idx, offset ) ) | 
|  | 409 | 
|  | 410     #gap_chars_tuple = tuple( GAP_CHARS ) | 
|  | 411     gap_chars_str = ''.join( GAP_CHARS ) | 
|  | 412     #Loop through ordered blocks and layer by increasing score | 
|  | 413     for block_dict in blocks: | 
|  | 414         for block in iter_blocks_split_by_species( block_dict[1].get_at_offset( block_dict[2] ) ): #need to handle each occurance of sequence in block seperately | 
|  | 415             if component_overlaps_region( block.get_component_by_src( primary_src ), region ): | 
|  | 416                 block = chop_block_by_region( block, primary_src, region, species, mincols ) #chop block | 
|  | 417                 block = orient_block_by_region( block, primary_src, region ) #orient block | 
|  | 418                 start_offset, species_texts = reduce_block_by_primary_genome( block, primary_species, chrom, start ) | 
|  | 419                 for spec, text in species_texts.items(): | 
|  | 420                     #we should trim gaps from both sides, since these are not positions in this species genome (sequence) | 
|  | 421                     text = text.rstrip( gap_chars_str ) | 
|  | 422                     gap_offset = 0 | 
|  | 423                     while True in [ text.startswith( gap_char ) for gap_char in GAP_CHARS ]: #python2.4 doesn't accept a tuple for .startswith() | 
|  | 424                     #while text.startswith( gap_chars_tuple ): | 
|  | 425                         gap_offset += 1 | 
|  | 426                         text = text[1:] | 
|  | 427                         if not text: | 
|  | 428                             break | 
|  | 429                     if text: | 
|  | 430                         if overwrite_with_gaps: | 
|  | 431                             alignment.set_range( start_offset + gap_offset, spec, text ) | 
|  | 432                         else: | 
|  | 433                             for i, char in enumerate( text ): | 
|  | 434                                 if char not in GAP_CHARS: | 
|  | 435                                     alignment.set_position( start_offset + gap_offset + i, spec, char ) | 
|  | 436     return alignment | 
|  | 437 | 
|  | 438 #returns a filled spliced region alignment for specified region with start and end lists | 
|  | 439 def get_spliced_region_alignment( index, primary_species, chrom, starts, ends, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ): | 
|  | 440     #create spliced alignment object | 
|  | 441     if species is not None: alignment = SplicedAlignment( starts, ends, species ) | 
|  | 442     else: alignment = SplicedAlignment( starts, ends, [primary_species] ) | 
|  | 443     for exon in alignment.exons: | 
|  | 444         fill_region_alignment( exon, index, primary_species, chrom, exon.start, exon.end, strand, species, mincols, overwrite_with_gaps ) | 
|  | 445     return alignment | 
|  | 446 | 
|  | 447 #loop through string array, only return non-commented lines | 
|  | 448 def line_enumerator( lines, comment_start = '#' ): | 
|  | 449     i = 0 | 
|  | 450     for line in lines: | 
|  | 451         if not line.startswith( comment_start ): | 
|  | 452             i += 1 | 
|  | 453             yield ( i, line ) | 
|  | 454 | 
|  | 455 #read a GeneBed file, return list of starts, ends, raw fields | 
|  | 456 def get_starts_ends_fields_from_gene_bed( line ): | 
|  | 457     #Starts and ends for exons | 
|  | 458     starts = [] | 
|  | 459     ends = [] | 
|  | 460 | 
|  | 461     fields = line.split() | 
|  | 462     #Requires atleast 12 BED columns | 
|  | 463     if len(fields) < 12: | 
|  | 464         raise Exception( "Not a proper 12 column BED line (%s)." % line ) | 
|  | 465     chrom     = fields[0] | 
|  | 466     tx_start  = int( fields[1] ) | 
|  | 467     tx_end    = int( fields[2] ) | 
|  | 468     name      = fields[3] | 
|  | 469     strand    = fields[5] | 
|  | 470     if strand != '-': strand='+' #Default strand is + | 
|  | 471     cds_start = int( fields[6] ) | 
|  | 472     cds_end   = int( fields[7] ) | 
|  | 473 | 
|  | 474     #Calculate and store starts and ends of coding exons | 
|  | 475     region_start, region_end = cds_start, cds_end | 
|  | 476     exon_starts = map( int, fields[11].rstrip( ',\n' ).split( ',' ) ) | 
|  | 477     exon_starts = map( ( lambda x: x + tx_start ), exon_starts ) | 
|  | 478     exon_ends = map( int, fields[10].rstrip( ',' ).split( ',' ) ) | 
|  | 479     exon_ends = map( ( lambda x, y: x + y ), exon_starts, exon_ends ); | 
|  | 480     for start, end in zip( exon_starts, exon_ends ): | 
|  | 481         start = max( start, region_start ) | 
|  | 482         end = min( end, region_end ) | 
|  | 483         if start < end: | 
|  | 484             starts.append( start ) | 
|  | 485             ends.append( end ) | 
|  | 486     return ( starts, ends, fields ) | 
|  | 487 | 
|  | 488 def iter_components_by_src( block, src ): | 
|  | 489     for c in block.components: | 
|  | 490         if c.src == src: | 
|  | 491             yield c | 
|  | 492 | 
|  | 493 def get_components_by_src( block, src ): | 
|  | 494     return [ value for value in iter_components_by_src( block, src ) ] | 
|  | 495 | 
|  | 496 def iter_components_by_src_start( block, src ): | 
|  | 497     for c in block.components: | 
|  | 498         if c.src.startswith( src ): | 
|  | 499             yield c | 
|  | 500 | 
|  | 501 def get_components_by_src_start( block, src ): | 
|  | 502     return [ value for value in iter_components_by_src_start( block, src ) ] | 
|  | 503 | 
|  | 504 def sort_block_components_by_block( block1, block2 ): | 
|  | 505     #orders the components in block1 by the index of the component in block2 | 
|  | 506     #block1 must be a subset of block2 | 
|  | 507     #occurs in-place | 
|  | 508     return block1.components.sort( cmp = lambda x, y: block2.components.index( x ) - block2.components.index( y ) ) | 
|  | 509 | 
|  | 510 def get_species_in_maf( maf_filename ): | 
|  | 511     species = [] | 
|  | 512     for block in bx.align.maf.Reader( open( maf_filename ) ): | 
|  | 513         for spec in get_species_in_block( block ): | 
|  | 514             if spec not in species: | 
|  | 515                 species.append( spec ) | 
|  | 516     return species | 
|  | 517 | 
|  | 518 def parse_species_option( species ): | 
|  | 519     if species: | 
|  | 520         species = species.split( ',' ) | 
|  | 521         if 'None' not in species: | 
|  | 522             return species | 
|  | 523     return None #provided species was '', None, or had 'None' in it | 
|  | 524 | 
|  | 525 def remove_temp_index_file( index_filename ): | 
|  | 526     try: os.unlink( index_filename ) | 
|  | 527     except: pass | 
|  | 528 | 
|  | 529 #Below are methods to deal with FASTA files | 
|  | 530 | 
|  | 531 def get_fasta_header( component, attributes = {}, suffix = None ): | 
|  | 532     header = ">%s(%s):%i-%i|" % ( component.src, component.strand, component.get_forward_strand_start(), component.get_forward_strand_end() ) | 
|  | 533     for key, value in attributes.iteritems(): | 
|  | 534         header = "%s%s=%s|" % ( header, key, value ) | 
|  | 535     if suffix: | 
|  | 536         header = "%s%s" % ( header, suffix ) | 
|  | 537     else: | 
|  | 538         header = "%s%s" % ( header, src_split( component.src )[ 0 ] ) | 
|  | 539     return header | 
|  | 540 | 
|  | 541 def get_attributes_from_fasta_header( header ): | 
|  | 542     if not header: return {} | 
|  | 543     attributes = {} | 
|  | 544     header = header.lstrip( '>' ) | 
|  | 545     header = header.strip() | 
|  | 546     fields = header.split( '|' ) | 
|  | 547     try: | 
|  | 548         region = fields[0] | 
|  | 549         region = region.split( '(', 1 ) | 
|  | 550         temp = region[0].split( '.', 1 ) | 
|  | 551         attributes['species'] = temp[0] | 
|  | 552         if len( temp ) == 2: | 
|  | 553             attributes['chrom'] = temp[1] | 
|  | 554         else: | 
|  | 555             attributes['chrom'] = temp[0] | 
|  | 556         region = region[1].split( ')', 1 ) | 
|  | 557         attributes['strand'] = region[0] | 
|  | 558         region = region[1].lstrip( ':' ).split( '-' ) | 
|  | 559         attributes['start'] = int( region[0] ) | 
|  | 560         attributes['end'] = int( region[1] ) | 
|  | 561     except: | 
|  | 562         #fields 0 is not a region coordinate | 
|  | 563         pass | 
|  | 564     if len( fields ) > 2: | 
|  | 565         for i in xrange( 1, len( fields ) - 1 ): | 
|  | 566             prop = fields[i].split( '=', 1 ) | 
|  | 567             if len( prop ) == 2: | 
|  | 568                 attributes[ prop[0] ] = prop[1] | 
|  | 569     if len( fields ) > 1: | 
|  | 570         attributes['__suffix__'] = fields[-1] | 
|  | 571     return attributes | 
|  | 572 | 
|  | 573 def iter_fasta_alignment( filename ): | 
|  | 574     class fastaComponent: | 
|  | 575         def __init__( self, species, text = "" ): | 
|  | 576             self.species = species | 
|  | 577             self.text = text | 
|  | 578         def extend( self, text ): | 
|  | 579             self.text = self.text + text.replace( '\n', '' ).replace( '\r', '' ).strip() | 
|  | 580     #yields a list of fastaComponents for a FASTA file | 
|  | 581     f = open( filename, 'rb' ) | 
|  | 582     components = [] | 
|  | 583     #cur_component = None | 
|  | 584     while True: | 
|  | 585         line = f.readline() | 
|  | 586         if not line: | 
|  | 587             if components: | 
|  | 588                 yield components | 
|  | 589             return | 
|  | 590         line = line.strip() | 
|  | 591         if not line: | 
|  | 592             if components: | 
|  | 593                 yield components | 
|  | 594             components = [] | 
|  | 595         elif line.startswith( '>' ): | 
|  | 596             attributes = get_attributes_from_fasta_header( line ) | 
|  | 597             components.append( fastaComponent( attributes['species'] ) ) | 
|  | 598         elif components: | 
|  | 599             components[-1].extend( line ) | 
|  | 600 |