comparison utils/maf_utilities.py @ 2:16df616b39e5 draft

"planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tools/fasta_concatenate_by_species commit cd1ed08574b749eee2a3f6e6151dbb0c8ca15bbf"
author devteam
date Sun, 01 Mar 2020 07:24:26 -0500
parents 717aee069681
children 25b8736c627a
comparison
equal deleted inserted replaced
1:717aee069681 2:16df616b39e5
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 """ 2 """
3 Provides wrappers and utilities for working with MAF files and alignments. 3 Provides wrappers and utilities for working with MAF files and alignments.
4 """ 4 """
5 #Dan Blankenberg 5 # Dan Blankenberg
6 import bx.align.maf 6 import bx.align.maf
7 import bx.intervals 7 import bx.intervals
8 import bx.interval_index_file 8 import bx.interval_index_file
9 import sys, os, string, tempfile 9 import sys
10 import os
11 import tempfile
10 import logging 12 import logging
11 from copy import deepcopy 13 from copy import deepcopy
12 14
13 assert sys.version_info[:2] >= ( 2, 4 ) 15 try:
16 maketrans = str.maketrans
17 except AttributeError:
18 from string import maketrans
19
20 assert sys.version_info[:2] >= (2, 4)
14 21
15 log = logging.getLogger(__name__) 22 log = logging.getLogger(__name__)
16 23
17 24
18 GAP_CHARS = [ '-' ] 25 GAP_CHARS = ['-']
19 SRC_SPLIT_CHAR = '.' 26 SRC_SPLIT_CHAR = '.'
20 27
21 def src_split( src ): 28
22 fields = src.split( SRC_SPLIT_CHAR, 1 ) 29 def src_split(src):
23 spec = fields.pop( 0 ) 30 fields = src.split(SRC_SPLIT_CHAR, 1)
31 spec = fields.pop(0)
24 if fields: 32 if fields:
25 chrom = fields.pop( 0 ) 33 chrom = fields.pop(0)
26 else: 34 else:
27 chrom = spec 35 chrom = spec
28 return spec, chrom 36 return spec, chrom
29 37
30 def src_merge( spec, chrom, contig = None ): 38
31 if None in [ spec, chrom ]: 39 def src_merge(spec, chrom, contig=None):
40 if None in [spec, chrom]:
32 spec = chrom = spec or chrom 41 spec = chrom = spec or chrom
33 return bx.align.maf.src_merge( spec, chrom, contig ) 42 return bx.align.maf.src_merge(spec, chrom, contig)
34 43
35 def get_species_in_block( block ): 44
45 def get_species_in_block(block):
36 species = [] 46 species = []
37 for c in block.components: 47 for c in block.components:
38 spec, chrom = src_split( c.src ) 48 spec, chrom = src_split(c.src)
39 if spec not in species: 49 if spec not in species:
40 species.append( spec ) 50 species.append(spec)
41 return species 51 return species
42 52
43 def tool_fail( msg = "Unknown Error" ): 53
44 print >> sys.stderr, "Fatal Error: %s" % msg 54 def tool_fail(msg="Unknown Error"):
45 sys.exit() 55 msg = "Fatal Error: %s" % msg
46 56 sys.exit(msg)
47 #an object corresponding to a reference layered alignment 57
48 class RegionAlignment( object ): 58 # an object corresponding to a reference layered alignment
49 59
50 DNA_COMPLEMENT = string.maketrans( "ACGTacgt", "TGCAtgca" ) 60
51 MAX_SEQUENCE_SIZE = sys.maxint #Maximum length of sequence allowed 61 class RegionAlignment(object):
52 62
53 def __init__( self, size, species = [] ): 63 DNA_COMPLEMENT = maketrans("ACGTacgt", "TGCAtgca")
54 assert size <= self.MAX_SEQUENCE_SIZE, "Maximum length allowed for an individual sequence has been exceeded (%i > %i)." % ( size, self.MAX_SEQUENCE_SIZE ) 64
65 def __init__(self, size, species=[]):
55 self.size = size 66 self.size = size
56 self.sequences = {} 67 self.sequences = {}
57 if not isinstance( species, list ): 68 if not isinstance(species, list):
58 species = [species] 69 species = [species]
59 for spec in species: 70 for spec in species:
60 self.add_species( spec ) 71 self.add_species(spec)
61 72
62 #add a species to the alignment 73 # add a species to the alignment
63 def add_species( self, species ): 74 def add_species(self, species):
64 #make temporary sequence files 75 # make temporary sequence files
65 self.sequences[species] = tempfile.TemporaryFile() 76 self.sequences[species] = tempfile.TemporaryFile()
66 self.sequences[species].write( "-" * self.size ) 77 self.sequences[species].write("-" * self.size)
67 78
68 #returns the names for species found in alignment, skipping names as requested 79 # returns the names for species found in alignment, skipping names as requested
69 def get_species_names( self, skip = [] ): 80 def get_species_names(self, skip=[]):
70 if not isinstance( skip, list ): skip = [skip] 81 if not isinstance(skip, list):
82 skip = [skip]
71 names = self.sequences.keys() 83 names = self.sequences.keys()
72 for name in skip: 84 for name in skip:
73 try: names.remove( name ) 85 try:
74 except: pass 86 names.remove(name)
87 except Exception:
88 pass
75 return names 89 return names
76 90
77 #returns the sequence for a species 91 # returns the sequence for a species
78 def get_sequence( self, species ): 92 def get_sequence(self, species):
79 self.sequences[species].seek( 0 ) 93 self.sequences[species].seek(0)
80 return self.sequences[species].read() 94 return self.sequences[species].read()
81 95
82 #returns the reverse complement of the sequence for a species 96 # returns the reverse complement of the sequence for a species
83 def get_sequence_reverse_complement( self, species ): 97 def get_sequence_reverse_complement(self, species):
84 complement = [base for base in self.get_sequence( species ).translate( self.DNA_COMPLEMENT )] 98 complement = [base for base in self.get_sequence(species).translate(self.DNA_COMPLEMENT)]
85 complement.reverse() 99 complement.reverse()
86 return "".join( complement ) 100 return "".join(complement)
87 101
88 #sets a position for a species 102 # sets a position for a species
89 def set_position( self, index, species, base ): 103 def set_position(self, index, species, base):
90 if len( base ) != 1: raise Exception( "A genomic position can only have a length of 1." ) 104 if len(base) != 1:
91 return self.set_range( index, species, base ) 105 raise Exception("A genomic position can only have a length of 1.")
92 #sets a range for a species 106 return self.set_range(index, species, base)
93 def set_range( self, index, species, bases ): 107 # sets a range for a species
94 if index >= self.size or index < 0: raise Exception( "Your index (%i) is out of range (0 - %i)." % ( index, self.size - 1 ) ) 108
95 if len( bases ) == 0: raise Exception( "A set of genomic positions can only have a positive length." ) 109 def set_range(self, index, species, bases):
96 if species not in self.sequences.keys(): self.add_species( species ) 110 if index >= self.size or index < 0:
97 self.sequences[species].seek( index ) 111 raise Exception("Your index (%i) is out of range (0 - %i)." % (index, self.size - 1))
98 self.sequences[species].write( bases ) 112 if len(bases) == 0:
99 113 raise Exception("A set of genomic positions can only have a positive length.")
100 #Flush temp file of specified species, or all species 114 if species not in self.sequences.keys():
101 def flush( self, species = None ): 115 self.add_species(species)
116 self.sequences[species].seek(index)
117 self.sequences[species].write(bases)
118
119 # Flush temp file of specified species, or all species
120 def flush(self, species=None):
102 if species is None: 121 if species is None:
103 species = self.sequences.keys() 122 species = self.sequences.keys()
104 elif not isinstance( species, list ): 123 elif not isinstance(species, list):
105 species = [species] 124 species = [species]
106 for spec in species: 125 for spec in species:
107 self.sequences[spec].flush() 126 self.sequences[spec].flush()
108 127
109 class GenomicRegionAlignment( RegionAlignment ): 128
110 129 class GenomicRegionAlignment(RegionAlignment):
111 def __init__( self, start, end, species = [] ): 130
112 RegionAlignment.__init__( self, end - start, species ) 131 def __init__(self, start, end, species=[]):
132 RegionAlignment.__init__(self, end - start, species)
113 self.start = start 133 self.start = start
114 self.end = end 134 self.end = end
115 135
116 class SplicedAlignment( object ): 136
117 137 class SplicedAlignment(object):
118 DNA_COMPLEMENT = string.maketrans( "ACGTacgt", "TGCAtgca" ) 138
119 139 DNA_COMPLEMENT = maketrans("ACGTacgt", "TGCAtgca")
120 def __init__( self, exon_starts, exon_ends, species = [] ): 140
121 if not isinstance( exon_starts, list ): 141 def __init__(self, exon_starts, exon_ends, species=[]):
142 if not isinstance(exon_starts, list):
122 exon_starts = [exon_starts] 143 exon_starts = [exon_starts]
123 if not isinstance( exon_ends, list ): 144 if not isinstance(exon_ends, list):
124 exon_ends = [exon_ends] 145 exon_ends = [exon_ends]
125 assert len( exon_starts ) == len( exon_ends ), "The number of starts does not match the number of sizes." 146 assert len(exon_starts) == len(exon_ends), "The number of starts does not match the number of sizes."
126 self.exons = [] 147 self.exons = []
127 for i in range( len( exon_starts ) ): 148 for i in range(len(exon_starts)):
128 self.exons.append( GenomicRegionAlignment( exon_starts[i], exon_ends[i], species ) ) 149 self.exons.append(GenomicRegionAlignment(exon_starts[i], exon_ends[i], species))
129 150
130 #returns the names for species found in alignment, skipping names as requested 151 # returns the names for species found in alignment, skipping names as requested
131 def get_species_names( self, skip = [] ): 152 def get_species_names(self, skip=[]):
132 if not isinstance( skip, list ): skip = [skip] 153 if not isinstance(skip, list):
154 skip = [skip]
133 names = [] 155 names = []
134 for exon in self.exons: 156 for exon in self.exons:
135 for name in exon.get_species_names( skip = skip ): 157 for name in exon.get_species_names(skip=skip):
136 if name not in names: 158 if name not in names:
137 names.append( name ) 159 names.append(name)
138 return names 160 return names
139 161
140 #returns the sequence for a species 162 # returns the sequence for a species
141 def get_sequence( self, species ): 163 def get_sequence(self, species):
142 sequence = tempfile.TemporaryFile() 164 sequence = tempfile.TemporaryFile()
143 for exon in self.exons: 165 for exon in self.exons:
144 if species in exon.get_species_names(): 166 if species in exon.get_species_names():
145 sequence.write( exon.get_sequence( species ) ) 167 sequence.write(exon.get_sequence(species))
146 else: 168 else:
147 sequence.write( "-" * exon.size ) 169 sequence.write("-" * exon.size)
148 sequence.seek( 0 ) 170 sequence.seek(0)
149 return sequence.read() 171 return sequence.read()
150 172
151 #returns the reverse complement of the sequence for a species 173 # returns the reverse complement of the sequence for a species
152 def get_sequence_reverse_complement( self, species ): 174 def get_sequence_reverse_complement(self, species):
153 complement = [base for base in self.get_sequence( species ).translate( self.DNA_COMPLEMENT )] 175 complement = [base for base in self.get_sequence(species).translate(self.DNA_COMPLEMENT)]
154 complement.reverse() 176 complement.reverse()
155 return "".join( complement ) 177 return "".join(complement)
156 178
157 #Start and end of coding region 179 # Start and end of coding region
158 @property 180 @property
159 def start( self ): 181 def start(self):
160 return self.exons[0].start 182 return self.exons[0].start
183
161 @property 184 @property
162 def end( self ): 185 def end(self):
163 return self.exons[-1].end 186 return self.exons[-1].end
164 187
165 #Open a MAF index using a UID 188 # Open a MAF index using a UID
166 def maf_index_by_uid( maf_uid, index_location_file ): 189
167 for line in open( index_location_file ): 190
191 def maf_index_by_uid(maf_uid, index_location_file):
192 for line in open(index_location_file):
168 try: 193 try:
169 #read each line, if not enough fields, go to next line 194 # read each line, if not enough fields, go to next line
170 if line[0:1] == "#" : continue 195 if line[0:1] == "#":
196 continue
171 fields = line.split('\t') 197 fields = line.split('\t')
172 if maf_uid == fields[1]: 198 if maf_uid == fields[1]:
173 try: 199 try:
174 maf_files = fields[4].replace( "\n", "" ).replace( "\r", "" ).split( "," ) 200 maf_files = fields[4].replace("\n", "").replace("\r", "").split(",")
175 return bx.align.maf.MultiIndexed( maf_files, keep_open = True, parse_e_rows = False ) 201 return bx.align.maf.MultiIndexed(maf_files, keep_open=True, parse_e_rows=False)
176 except Exception, e: 202 except Exception as e:
177 raise Exception( 'MAF UID (%s) found, but configuration appears to be malformed: %s' % ( maf_uid, e ) ) 203 raise Exception('MAF UID (%s) found, but configuration appears to be malformed: %s' % (maf_uid, e))
178 except: 204 except Exception:
179 pass 205 pass
180 return None 206 return None
181 207
182 #return ( index, temp_index_filename ) for user maf, if available, or build one and return it, return None when no tempfile is created 208 # return ( index, temp_index_filename ) for user maf, if available, or build one and return it, return None when no tempfile is created
183 def open_or_build_maf_index( maf_file, index_filename, species = None ): 209
210
211 def open_or_build_maf_index(maf_file, index_filename, species=None):
184 try: 212 try:
185 return ( bx.align.maf.Indexed( maf_file, index_filename = index_filename, keep_open = True, parse_e_rows = False ), None ) 213 return (bx.align.maf.Indexed(maf_file, index_filename=index_filename, keep_open=True, parse_e_rows=False), None)
186 except: 214 except Exception:
187 return build_maf_index( maf_file, species = species ) 215 return build_maf_index(maf_file, species=species)
188 216
189 def build_maf_index_species_chromosomes( filename, index_species = None ): 217
218 def build_maf_index_species_chromosomes(filename, index_species=None):
190 species = [] 219 species = []
191 species_chromosomes = {} 220 species_chromosomes = {}
192 indexes = bx.interval_index_file.Indexes() 221 indexes = bx.interval_index_file.Indexes()
193 blocks = 0 222 blocks = 0
194 try: 223 try:
195 maf_reader = bx.align.maf.Reader( open( filename ) ) 224 maf_reader = bx.align.maf.Reader(open(filename))
196 while True: 225 while True:
197 pos = maf_reader.file.tell() 226 pos = maf_reader.file.tell()
198 block = maf_reader.next() 227 block = maf_reader.next()
199 if block is None: 228 if block is None:
200 break 229 break
201 blocks += 1 230 blocks += 1
202 for c in block.components: 231 for c in block.components:
203 spec = c.src 232 spec = c.src
204 chrom = None 233 chrom = None
205 if "." in spec: 234 if "." in spec:
206 spec, chrom = spec.split( ".", 1 ) 235 spec, chrom = spec.split(".", 1)
207 if spec not in species: 236 if spec not in species:
208 species.append( spec ) 237 species.append(spec)
209 species_chromosomes[spec] = [] 238 species_chromosomes[spec] = []
210 if chrom and chrom not in species_chromosomes[spec]: 239 if chrom and chrom not in species_chromosomes[spec]:
211 species_chromosomes[spec].append( chrom ) 240 species_chromosomes[spec].append(chrom)
212 if index_species is None or spec in index_species: 241 if index_species is None or spec in index_species:
213 forward_strand_start = c.forward_strand_start 242 forward_strand_start = c.forward_strand_start
214 forward_strand_end = c.forward_strand_end 243 forward_strand_end = c.forward_strand_end
215 try: 244 try:
216 forward_strand_start = int( forward_strand_start ) 245 forward_strand_start = int(forward_strand_start)
217 forward_strand_end = int( forward_strand_end ) 246 forward_strand_end = int(forward_strand_end)
218 except ValueError: 247 except ValueError:
219 continue #start and end are not integers, can't add component to index, goto next component 248 continue # start and end are not integers, can't add component to index, goto next component
220 #this likely only occurs when parse_e_rows is True? 249 # this likely only occurs when parse_e_rows is True?
221 #could a species exist as only e rows? should the 250 # could a species exist as only e rows? should the
222 if forward_strand_end > forward_strand_start: 251 if forward_strand_end > forward_strand_start:
223 #require positive length; i.e. certain lines have start = end = 0 and cannot be indexed 252 # require positive length; i.e. certain lines have start = end = 0 and cannot be indexed
224 indexes.add( c.src, forward_strand_start, forward_strand_end, pos, max=c.src_size ) 253 indexes.add(c.src, forward_strand_start, forward_strand_end, pos, max=c.src_size)
225 except Exception, e: 254 except Exception as e:
226 #most likely a bad MAF 255 # most likely a bad MAF
227 log.debug( 'Building MAF index on %s failed: %s' % ( filename, e ) ) 256 log.debug('Building MAF index on %s failed: %s' % (filename, e))
228 return ( None, [], {}, 0 ) 257 return (None, [], {}, 0)
229 return ( indexes, species, species_chromosomes, blocks ) 258 return (indexes, species, species_chromosomes, blocks)
230 259
231 #builds and returns ( index, index_filename ) for specified maf_file 260 # builds and returns ( index, index_filename ) for specified maf_file
232 def build_maf_index( maf_file, species = None ): 261
233 indexes, found_species, species_chromosomes, blocks = build_maf_index_species_chromosomes( maf_file, species ) 262
263 def build_maf_index(maf_file, species=None):
264 indexes, found_species, species_chromosomes, blocks = build_maf_index_species_chromosomes(maf_file, species)
234 if indexes is not None: 265 if indexes is not None:
235 fd, index_filename = tempfile.mkstemp() 266 fd, index_filename = tempfile.mkstemp()
236 out = os.fdopen( fd, 'w' ) 267 out = os.fdopen(fd, 'w')
237 indexes.write( out ) 268 indexes.write(out)
238 out.close() 269 out.close()
239 return ( bx.align.maf.Indexed( maf_file, index_filename = index_filename, keep_open = True, parse_e_rows = False ), index_filename ) 270 return (bx.align.maf.Indexed(maf_file, index_filename=index_filename, keep_open=True, parse_e_rows=False), index_filename)
240 return ( None, None ) 271 return (None, None)
241 272
242 def component_overlaps_region( c, region ): 273
243 if c is None: return False 274 def component_overlaps_region(c, region):
275 if c is None:
276 return False
244 start, end = c.get_forward_strand_start(), c.get_forward_strand_end() 277 start, end = c.get_forward_strand_start(), c.get_forward_strand_end()
245 if region.start >= end or region.end <= start: 278 if region.start >= end or region.end <= start:
246 return False 279 return False
247 return True 280 return True
248 281
249 def chop_block_by_region( block, src, region, species = None, mincols = 0 ): 282
283 def chop_block_by_region(block, src, region, species=None, mincols=0):
250 # This chopping method was designed to maintain consistency with how start/end padding gaps have been working in Galaxy thus far: 284 # This chopping method was designed to maintain consistency with how start/end padding gaps have been working in Galaxy thus far:
251 # behavior as seen when forcing blocks to be '+' relative to src sequence (ref) and using block.slice_by_component( ref, slice_start, slice_end ) 285 # behavior as seen when forcing blocks to be '+' relative to src sequence (ref) and using block.slice_by_component( ref, slice_start, slice_end )
252 # whether-or-not this is the 'correct' behavior is questionable, but this will at least maintain consistency 286 # whether-or-not this is the 'correct' behavior is questionable, but this will at least maintain consistency
253 # comments welcome 287 # comments welcome
254 slice_start = block.text_size #max for the min() 288 slice_start = block.text_size # max for the min()
255 slice_end = 0 #min for the max() 289 slice_end = 0 # min for the max()
256 old_score = block.score #save old score for later use 290 old_score = block.score # save old score for later use
257 # We no longer assume only one occurance of src per block, so we need to check them all 291 # We no longer assume only one occurance of src per block, so we need to check them all
258 for c in iter_components_by_src( block, src ): 292 for c in iter_components_by_src(block, src):
259 if component_overlaps_region( c, region ): 293 if component_overlaps_region(c, region):
260 if c.text is not None: 294 if c.text is not None:
261 rev_strand = False 295 rev_strand = False
262 if c.strand == "-": 296 if c.strand == "-":
263 #We want our coord_to_col coordinates to be returned from positive stranded component 297 # We want our coord_to_col coordinates to be returned from positive stranded component
264 rev_strand = True 298 rev_strand = True
265 c = c.reverse_complement() 299 c = c.reverse_complement()
266 start = max( region.start, c.start ) 300 start = max(region.start, c.start)
267 end = min( region.end, c.end ) 301 end = min(region.end, c.end)
268 start = c.coord_to_col( start ) 302 start = c.coord_to_col(start)
269 end = c.coord_to_col( end ) 303 end = c.coord_to_col(end)
270 if rev_strand: 304 if rev_strand:
271 #need to orient slice coordinates to the original block direction 305 # need to orient slice coordinates to the original block direction
272 slice_len = end - start 306 slice_len = end - start
273 end = len( c.text ) - start 307 end = len(c.text) - start
274 start = end - slice_len 308 start = end - slice_len
275 slice_start = min( start, slice_start ) 309 slice_start = min(start, slice_start)
276 slice_end = max( end, slice_end ) 310 slice_end = max(end, slice_end)
277 311
278 if slice_start < slice_end: 312 if slice_start < slice_end:
279 block = block.slice( slice_start, slice_end ) 313 block = block.slice(slice_start, slice_end)
280 if block.text_size > mincols: 314 if block.text_size > mincols:
281 # restore old score, may not be accurate, but it is better than 0 for everything? 315 # restore old score, may not be accurate, but it is better than 0 for everything?
282 block.score = old_score 316 block.score = old_score
283 if species is not None: 317 if species is not None:
284 block = block.limit_to_species( species ) 318 block = block.limit_to_species(species)
285 block.remove_all_gap_columns() 319 block.remove_all_gap_columns()
286 return block 320 return block
287 return None 321 return None
288 322
289 def orient_block_by_region( block, src, region, force_strand = None ): 323
290 #loop through components matching src, 324 def orient_block_by_region(block, src, region, force_strand=None):
291 #make sure each of these components overlap region 325 # loop through components matching src,
292 #cache strand for each of overlaping regions 326 # make sure each of these components overlap region
293 #if force_strand / region.strand not in strand cache, reverse complement 327 # cache strand for each of overlaping regions
294 ### we could have 2 sequences with same src, overlapping region, on different strands, this would cause no reverse_complementing 328 # if force_strand / region.strand not in strand cache, reverse complement
295 strands = [ c.strand for c in iter_components_by_src( block, src ) if component_overlaps_region( c, region ) ] 329 # we could have 2 sequences with same src, overlapping region, on different strands, this would cause no reverse_complementing
296 if strands and ( force_strand is None and region.strand not in strands ) or ( force_strand is not None and force_strand not in strands ): 330 strands = [c.strand for c in iter_components_by_src(block, src) if component_overlaps_region(c, region)]
331 if strands and (force_strand is None and region.strand not in strands) or (force_strand is not None and force_strand not in strands):
297 block = block.reverse_complement() 332 block = block.reverse_complement()
298 return block 333 return block
299 334
300 def get_oriented_chopped_blocks_for_region( index, src, region, species = None, mincols = 0, force_strand = None ): 335
301 for block, idx, offset in get_oriented_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols, force_strand ): 336 def get_oriented_chopped_blocks_for_region(index, src, region, species=None, mincols=0, force_strand=None):
337 for block, idx, offset in get_oriented_chopped_blocks_with_index_offset_for_region(index, src, region, species, mincols, force_strand):
302 yield block 338 yield block
303 def get_oriented_chopped_blocks_with_index_offset_for_region( index, src, region, species = None, mincols = 0, force_strand = None ): 339
304 for block, idx, offset in get_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols ): 340
305 yield orient_block_by_region( block, src, region, force_strand ), idx, offset 341 def get_oriented_chopped_blocks_with_index_offset_for_region(index, src, region, species=None, mincols=0, force_strand=None):
306 342 for block, idx, offset in get_chopped_blocks_with_index_offset_for_region(index, src, region, species, mincols):
307 #split a block with multiple occurances of src into one block per src 343 yield orient_block_by_region(block, src, region, force_strand), idx, offset
308 def iter_blocks_split_by_src( block, src ): 344
309 for src_c in iter_components_by_src( block, src ): 345 # split a block with multiple occurances of src into one block per src
310 new_block = bx.align.Alignment( score=block.score, attributes=deepcopy( block.attributes ) ) 346
347
348 def iter_blocks_split_by_src(block, src):
349 for src_c in iter_components_by_src(block, src):
350 new_block = bx.align.Alignment(score=block.score, attributes=deepcopy(block.attributes))
311 new_block.text_size = block.text_size 351 new_block.text_size = block.text_size
312 for c in block.components: 352 for c in block.components:
313 if c == src_c or c.src != src: 353 if c == src_c or c.src != src:
314 new_block.add_component( deepcopy( c ) ) #components have reference to alignment, dont want to loose reference to original alignment block in original components 354 new_block.add_component(deepcopy(c)) # components have reference to alignment, dont want to loose reference to original alignment block in original components
315 yield new_block 355 yield new_block
316 356
317 #split a block into multiple blocks with all combinations of a species appearing only once per block 357 # split a block into multiple blocks with all combinations of a species appearing only once per block
318 def iter_blocks_split_by_species( block, species = None ): 358
319 def __split_components_by_species( components_by_species, new_block ): 359
360 def iter_blocks_split_by_species(block, species=None):
361 def __split_components_by_species(components_by_species, new_block):
320 if components_by_species: 362 if components_by_species:
321 #more species with components to add to this block 363 # more species with components to add to this block
322 components_by_species = deepcopy( components_by_species ) 364 components_by_species = deepcopy(components_by_species)
323 spec_comps = components_by_species.pop( 0 ) 365 spec_comps = components_by_species.pop(0)
324 for c in spec_comps: 366 for c in spec_comps:
325 newer_block = deepcopy( new_block ) 367 newer_block = deepcopy(new_block)
326 newer_block.add_component( deepcopy( c ) ) 368 newer_block.add_component(deepcopy(c))
327 for value in __split_components_by_species( components_by_species, newer_block ): 369 for value in __split_components_by_species(components_by_species, newer_block):
328 yield value 370 yield value
329 else: 371 else:
330 #no more components to add, yield this block 372 # no more components to add, yield this block
331 yield new_block 373 yield new_block
332 374
333 #divide components by species 375 # divide components by species
334 spec_dict = {} 376 spec_dict = {}
335 if not species: 377 if not species:
336 species = [] 378 species = []
337 for c in block.components: 379 for c in block.components:
338 spec, chrom = src_split( c.src ) 380 spec, chrom = src_split(c.src)
339 if spec not in spec_dict: 381 if spec not in spec_dict:
340 spec_dict[ spec ] = [] 382 spec_dict[spec] = []
341 species.append( spec ) 383 species.append(spec)
342 spec_dict[ spec ].append( c ) 384 spec_dict[spec].append(c)
343 else: 385 else:
344 for spec in species: 386 for spec in species:
345 spec_dict[ spec ] = [] 387 spec_dict[spec] = []
346 for c in iter_components_by_src_start( block, spec ): 388 for c in iter_components_by_src_start(block, spec):
347 spec_dict[ spec ].append( c ) 389 spec_dict[spec].append(c)
348 390
349 empty_block = bx.align.Alignment( score=block.score, attributes=deepcopy( block.attributes ) ) #should we copy attributes? 391 empty_block = bx.align.Alignment(score=block.score, attributes=deepcopy(block.attributes)) # should we copy attributes?
350 empty_block.text_size = block.text_size 392 empty_block.text_size = block.text_size
351 #call recursive function to split into each combo of spec/blocks 393 # call recursive function to split into each combo of spec/blocks
352 for value in __split_components_by_species( spec_dict.values(), empty_block ): 394 for value in __split_components_by_species(spec_dict.values(), empty_block):
353 sort_block_components_by_block( value, block ) #restore original component order 395 sort_block_components_by_block(value, block) # restore original component order
354 yield value 396 yield value
355 397
356 398
357 #generator yielding only chopped and valid blocks for a specified region 399 # generator yielding only chopped and valid blocks for a specified region
358 def get_chopped_blocks_for_region( index, src, region, species = None, mincols = 0 ): 400 def get_chopped_blocks_for_region(index, src, region, species=None, mincols=0):
359 for block, idx, offset in get_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols ): 401 for block, idx, offset in get_chopped_blocks_with_index_offset_for_region(index, src, region, species, mincols):
360 yield block 402 yield block
361 def get_chopped_blocks_with_index_offset_for_region( index, src, region, species = None, mincols = 0 ): 403
362 for block, idx, offset in index.get_as_iterator_with_index_and_offset( src, region.start, region.end ): 404
363 block = chop_block_by_region( block, src, region, species, mincols ) 405 def get_chopped_blocks_with_index_offset_for_region(index, src, region, species=None, mincols=0):
406 for block, idx, offset in index.get_as_iterator_with_index_and_offset(src, region.start, region.end):
407 block = chop_block_by_region(block, src, region, species, mincols)
364 if block is not None: 408 if block is not None:
365 yield block, idx, offset 409 yield block, idx, offset
366 410
367 #returns a filled region alignment for specified regions 411 # returns a filled region alignment for specified regions
368 def get_region_alignment( index, primary_species, chrom, start, end, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ): 412
369 if species is not None: alignment = RegionAlignment( end - start, species ) 413
370 else: alignment = RegionAlignment( end - start, primary_species ) 414 def get_region_alignment(index, primary_species, chrom, start, end, strand='+', species=None, mincols=0, overwrite_with_gaps=True):
371 return fill_region_alignment( alignment, index, primary_species, chrom, start, end, strand, species, mincols, overwrite_with_gaps ) 415 if species is not None:
372 416 alignment = RegionAlignment(end - start, species)
373 #reduces a block to only positions exisiting in the src provided 417 else:
374 def reduce_block_by_primary_genome( block, species, chromosome, region_start ): 418 alignment = RegionAlignment(end - start, primary_species)
375 #returns ( startIndex, {species:texts} 419 return fill_region_alignment(alignment, index, primary_species, chrom, start, end, strand, species, mincols, overwrite_with_gaps)
# reduces a block to only positions existing in the src provided
def reduce_block_by_primary_genome(block, species, chromosome, region_start):
    """Collapse a block's alignment texts to only the columns where the
    primary genome (species.chromosome) has a real base (not a gap).

    Returns (start_offset, {species_name: text}) where start_offset is the
    reference component's start relative to region_start.
    """
    src = "%s.%s" % (species, chromosome)
    ref = block.get_component_by_src(src)
    start_offset = ref.start - region_start
    # species name -> full alignment text (a later component for the same
    # species overwrites an earlier one, as in the original mapping)
    texts = {}
    for component in block.components:
        texts[component.src.split('.')[0]] = component.text
    # keep only the alignment columns that are not gaps in the primary species
    keep_columns = [i for i, base in enumerate(texts[species]) if base != '-']
    reduced = {}
    for spec, text in texts.items():
        reduced[spec] = ''.join(text[i] for i in keep_columns)
    return (start_offset, reduced)
391 441
# fills a region alignment
def fill_region_alignment(alignment, index, primary_species, chrom, start, end, strand='+', species=None, mincols=0, overwrite_with_gaps=True):
    """Fill ``alignment`` with sequence for [start, end) on primary_species.chrom.

    Overlapping MAF blocks are applied in order of increasing score, so text
    from higher-scoring blocks overwrites text from lower-scoring ones.
    When ``overwrite_with_gaps`` is False, gap characters never overwrite a
    previously written base.  Returns the (mutated) alignment.
    """
    region = bx.intervals.Interval(start, end)
    region.chrom = chrom
    region.strand = strand
    primary_src = "%s.%s" % (primary_species, chrom)

    # Collect blocks overlapping this region ordered by increasing score.
    # A stable sort keeps file order among equal scores, matching the
    # behavior of the previous insertion-sort loop (which was O(n^2)).
    blocks = [(float(block.score), idx, offset)
              for block, idx, offset in index.get_as_iterator_with_index_and_offset(primary_src, start, end)]
    blocks.sort(key=lambda entry: entry[0])

    gap_chars_str = ''.join(GAP_CHARS)
    # Loop through ordered blocks and layer by increasing score
    for _score, idx, offset in blocks:
        # need to handle each occurrence of a sequence in a block separately
        for block in iter_blocks_split_by_species(idx.get_at_offset(offset)):
            if component_overlaps_region(block.get_component_by_src(primary_src), region):
                block = chop_block_by_region(block, primary_src, region, species, mincols)  # chop block
                block = orient_block_by_region(block, primary_src, region)  # orient block
                start_offset, species_texts = reduce_block_by_primary_genome(block, primary_species, chrom, start)
                for spec, text in species_texts.items():
                    # trim gaps from both sides, since these are not positions
                    # in this species' genome (sequence)
                    text = text.rstrip(gap_chars_str)
                    stripped = text.lstrip(gap_chars_str)
                    # leading gaps shift the position where writing starts
                    gap_offset = len(text) - len(stripped)
                    text = stripped
                    if text:
                        if overwrite_with_gaps:
                            alignment.set_range(start_offset + gap_offset, spec, text)
                        else:
                            # write base-by-base, never overwriting with a gap
                            for i, char in enumerate(text):
                                if char not in GAP_CHARS:
                                    alignment.set_position(start_offset + gap_offset + i, spec, char)
    return alignment
437 489
# returns a filled spliced region alignment for specified region with start and end lists
def get_spliced_region_alignment(index, primary_species, chrom, starts, ends, strand='+', species=None, mincols=0, overwrite_with_gaps=True):
    """Build a SplicedAlignment whose exons cover the (starts, ends) pairs on
    primary_species.chrom, filling each exon from the indexed MAF blocks.
    """
    # create the spliced alignment object, tracking either the requested
    # species or just the primary species
    tracked = [primary_species] if species is None else species
    spliced = SplicedAlignment(starts, ends, tracked)
    for exon in spliced.exons:
        fill_region_alignment(exon, index, primary_species, chrom,
                              exon.start, exon.end, strand, species,
                              mincols, overwrite_with_gaps)
    return spliced
446 502
# loop through string array, only return non-commented lines
def line_enumerator(lines, comment_start='#'):
    """Yield (count, line) for each line not starting with comment_start.

    The count starts at 1 and only advances on non-comment lines.
    """
    count = 0
    for line in lines:
        if line.startswith(comment_start):
            continue
        count += 1
        yield (count, line)
454 512
# read a GeneBed file, return list of starts, ends, raw fields
def get_starts_ends_fields_from_gene_bed(line):
    """Parse a 12-column (gene) BED line.

    Returns (starts, ends, fields) where starts/ends are the coding-exon
    coordinates clipped to the thickStart/thickEnd (CDS) region, and fields
    is the raw whitespace-split column list.  Raises Exception for lines
    with fewer than 12 columns.
    """
    starts = []
    ends = []

    fields = line.split()
    # Requires at least 12 BED columns
    if len(fields) < 12:
        raise Exception("Not a proper 12 column BED line (%s)." % line)
    tx_start = int(fields[1])
    cds_start = int(fields[6])
    cds_end = int(fields[7])

    # Calculate and store starts and ends of coding exons.
    # NOTE: the previous implementation chained map() objects and reused the
    # exon_starts iterator in both the second map() and zip(); on Python 3
    # (map returns a lazy iterator) that silently produced wrong/empty
    # results.  List comprehensions materialize each step instead.
    region_start, region_end = cds_start, cds_end
    exon_starts = [int(x) + tx_start for x in fields[11].rstrip(',\n').split(',')]
    exon_sizes = [int(x) for x in fields[10].rstrip(',').split(',')]
    for exon_start, exon_size in zip(exon_starts, exon_sizes):
        start = max(exon_start, region_start)
        end = min(exon_start + exon_size, region_end)
        if start < end:
            starts.append(start)
            ends.append(end)
    return (starts, ends, fields)
487 545
def iter_components_by_src(block, src):
    """Iterate over the components of ``block`` whose src exactly equals ``src``."""
    return (component for component in block.components if component.src == src)
492 551
def get_components_by_src(block, src):
    """Return the list of components of ``block`` whose src exactly equals ``src``."""
    return [component for component in block.components if component.src == src]
def iter_components_by_src_start(block, src):
    """Iterate over the components of ``block`` whose src starts with ``src``."""
    return (component for component in block.components if component.src.startswith(src))
500 561
def get_components_by_src_start(block, src):
    """Return the list of components of ``block`` whose src starts with ``src``."""
    return [component for component in block.components if component.src.startswith(src)]
def sort_block_components_by_block(block1, block2):
    """Order block1's components in-place by their index within block2's components.

    block1's components must be a subset of block2's.  Returns None, as
    ``list.sort`` sorts in place.
    """
    # The cmp= keyword was removed in Python 3 (this file is py3-ported,
    # see the maketrans shim); an index-based key yields the same ordering.
    return block1.components.sort(key=block2.components.index)
def get_species_in_maf(maf_filename):
    """Return the species present in a MAF file, in order of first appearance."""
    species = []
    # open via a context manager so the handle is closed even on error
    # (previously the file object returned by open() was never closed)
    with open(maf_filename) as maf_file:
        for block in bx.align.maf.Reader(maf_file):
            for spec in get_species_in_block(block):
                if spec not in species:
                    species.append(spec)
    return species
517 581
def parse_species_option(species):
    """Parse a comma-separated species option string into a list.

    Returns None when the option is empty/None or contains the literal
    token 'None'.
    """
    if not species:
        return None
    parsed = species.split(',')
    if 'None' in parsed:
        return None
    return parsed
524 589
def remove_temp_index_file(index_filename):
    """Best-effort removal of a temporary index file; all errors are ignored."""
    try:
        # os.remove is an alias for os.unlink
        os.remove(index_filename)
    except Exception:
        pass
def get_fasta_header(component, attributes=None, suffix=None):
    """Build a FASTA header line for an alignment component.

    The header has the form ``>src(strand):start-end|key=value|...|suffix``;
    when ``suffix`` is not given, the component's species name is used.
    ``attributes`` is an optional mapping rendered as ``key=value|`` segments.
    """
    # avoid a mutable default argument; an empty dict keeps prior behavior
    if attributes is None:
        attributes = {}
    header = ">%s(%s):%i-%i|" % (component.src, component.strand,
                                 component.get_forward_strand_start(),
                                 component.get_forward_strand_end())
    # dict.iteritems() was removed in Python 3; items() works on both
    for key, value in attributes.items():
        header = "%s%s=%s|" % (header, key, value)
    if suffix:
        header = "%s%s" % (header, suffix)
    else:
        header = "%s%s" % (header, src_split(component.src)[0])
    return header
540 609
def get_attributes_from_fasta_header(header):
    """Parse a '|'-delimited FASTA header (as built by get_fasta_header)
    into a dict of attributes.

    Coordinate parsing is best-effort: whatever parses before a failure is
    kept (e.g. species/chrom without strand/start/end).  Middle fields of
    the form key=value become entries; the last field, when present, is
    stored under '__suffix__'.
    """
    if not header:
        return {}
    attributes = {}
    fields = header.lstrip('>').strip().split('|')
    try:
        # region field looks like "species.chrom(strand):start-end";
        # assignments happen step-by-step so a partial parse keeps
        # whatever succeeded before the failing step
        parts = fields[0].split('(', 1)
        name_parts = parts[0].split('.', 1)
        attributes['species'] = name_parts[0]
        attributes['chrom'] = name_parts[1] if len(name_parts) == 2 else name_parts[0]
        rest = parts[1].split(')', 1)
        attributes['strand'] = rest[0]
        span = rest[1].lstrip(':').split('-')
        attributes['start'] = int(span[0])
        attributes['end'] = int(span[1])
    except Exception:
        # fields[0] was not a region coordinate
        pass
    if len(fields) > 2:
        for middle in fields[1:-1]:
            prop = middle.split('=', 1)
            if len(prop) == 2:
                attributes[prop[0]] = prop[1]
    if len(fields) > 1:
        attributes['__suffix__'] = fields[-1]
    return attributes
572 643
def iter_fasta_alignment(filename):
    """Yield lists of fastaComponent objects, one list per alignment.

    Alignments in the file are separated by blank lines; '>' lines start a
    new component whose species is taken from the header.
    """
    class fastaComponent:
        def __init__(self, species, text=""):
            self.species = species
            self.text = text

        def extend(self, text):
            self.text = self.text + text.replace('\n', '').replace('\r', '').strip()
    # yields a list of fastaComponents for a FASTA file
    with open(filename, 'r') as fasta_file:
        components = []
        for raw_line in fasta_file:
            line = raw_line.strip()
            if not line:
                # blank line ends the current alignment
                if components:
                    yield components
                    components = []
            elif line.startswith('>'):
                attributes = get_attributes_from_fasta_header(line)
                components.append(fastaComponent(attributes['species']))
            elif components:
                components[-1].extend(line)
        # flush the final alignment at end of file
        if components:
            yield components