comparison utils/maf_utilities.py @ 2:16df616b39e5 draft
"planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tools/fasta_concatenate_by_species commit cd1ed08574b749eee2a3f6e6151dbb0c8ca15bbf"
author | devteam |
---|---|
date | Sun, 01 Mar 2020 07:24:26 -0500 |
parents | 717aee069681 |
children | 25b8736c627a |
1:717aee069681 | 2:16df616b39e5 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 """ | 2 """ |
3 Provides wrappers and utilities for working with MAF files and alignments. | 3 Provides wrappers and utilities for working with MAF files and alignments. |
4 """ | 4 """ |
5 #Dan Blankenberg | 5 # Dan Blankenberg |
6 import bx.align.maf | 6 import bx.align.maf |
7 import bx.intervals | 7 import bx.intervals |
8 import bx.interval_index_file | 8 import bx.interval_index_file |
9 import sys, os, string, tempfile | 9 import sys |
10 import os | |
11 import tempfile | |
10 import logging | 12 import logging |
11 from copy import deepcopy | 13 from copy import deepcopy |
12 | 14 |
13 assert sys.version_info[:2] >= ( 2, 4 ) | 15 try: |
16 maketrans = str.maketrans | |
17 except AttributeError: | |
18 from string import maketrans | |
19 | |
20 assert sys.version_info[:2] >= (2, 4) | |
14 | 21 |
15 log = logging.getLogger(__name__) | 22 log = logging.getLogger(__name__) |
16 | 23 |
17 | 24 |
18 GAP_CHARS = [ '-' ] | 25 GAP_CHARS = ['-'] |
19 SRC_SPLIT_CHAR = '.' | 26 SRC_SPLIT_CHAR = '.' |
20 | 27 |
21 def src_split( src ): | 28 |
22 fields = src.split( SRC_SPLIT_CHAR, 1 ) | 29 def src_split(src): |
23 spec = fields.pop( 0 ) | 30 fields = src.split(SRC_SPLIT_CHAR, 1) |
31 spec = fields.pop(0) | |
24 if fields: | 32 if fields: |
25 chrom = fields.pop( 0 ) | 33 chrom = fields.pop(0) |
26 else: | 34 else: |
27 chrom = spec | 35 chrom = spec |
28 return spec, chrom | 36 return spec, chrom |
29 | 37 |
30 def src_merge( spec, chrom, contig = None ): | 38 |
31 if None in [ spec, chrom ]: | 39 def src_merge(spec, chrom, contig=None): |
40 if None in [spec, chrom]: | |
32 spec = chrom = spec or chrom | 41 spec = chrom = spec or chrom |
33 return bx.align.maf.src_merge( spec, chrom, contig ) | 42 return bx.align.maf.src_merge(spec, chrom, contig) |
34 | 43 |
35 def get_species_in_block( block ): | 44 |
45 def get_species_in_block(block): | |
36 species = [] | 46 species = [] |
37 for c in block.components: | 47 for c in block.components: |
38 spec, chrom = src_split( c.src ) | 48 spec, chrom = src_split(c.src) |
39 if spec not in species: | 49 if spec not in species: |
40 species.append( spec ) | 50 species.append(spec) |
41 return species | 51 return species |
42 | 52 |
43 def tool_fail( msg = "Unknown Error" ): | 53 |
44 print >> sys.stderr, "Fatal Error: %s" % msg | 54 def tool_fail(msg="Unknown Error"): |
45 sys.exit() | 55 msg = "Fatal Error: %s" % msg |
46 | 56 sys.exit(msg) |
47 #an object corresponding to a reference layered alignment | 57 |
48 class RegionAlignment( object ): | 58 # an object corresponding to a reference layered alignment |
49 | 59 |
50 DNA_COMPLEMENT = string.maketrans( "ACGTacgt", "TGCAtgca" ) | 60 |
51 MAX_SEQUENCE_SIZE = sys.maxint #Maximum length of sequence allowed | 61 class RegionAlignment(object): |
52 | 62 |
53 def __init__( self, size, species = [] ): | 63 DNA_COMPLEMENT = maketrans("ACGTacgt", "TGCAtgca") |
54 assert size <= self.MAX_SEQUENCE_SIZE, "Maximum length allowed for an individual sequence has been exceeded (%i > %i)." % ( size, self.MAX_SEQUENCE_SIZE ) | 64 |
65 def __init__(self, size, species=[]): | |
55 self.size = size | 66 self.size = size |
56 self.sequences = {} | 67 self.sequences = {} |
57 if not isinstance( species, list ): | 68 if not isinstance(species, list): |
58 species = [species] | 69 species = [species] |
59 for spec in species: | 70 for spec in species: |
60 self.add_species( spec ) | 71 self.add_species(spec) |
61 | 72 |
62 #add a species to the alignment | 73 # add a species to the alignment |
63 def add_species( self, species ): | 74 def add_species(self, species): |
64 #make temporary sequence files | 75 # make temporary sequence files |
65 self.sequences[species] = tempfile.TemporaryFile() | 76 self.sequences[species] = tempfile.TemporaryFile() |
66 self.sequences[species].write( "-" * self.size ) | 77 self.sequences[species].write("-" * self.size) |
67 | 78 |
68 #returns the names for species found in alignment, skipping names as requested | 79 # returns the names for species found in alignment, skipping names as requested |
69 def get_species_names( self, skip = [] ): | 80 def get_species_names(self, skip=[]): |
70 if not isinstance( skip, list ): skip = [skip] | 81 if not isinstance(skip, list): |
82 skip = [skip] | |
71 names = self.sequences.keys() | 83 names = self.sequences.keys() |
72 for name in skip: | 84 for name in skip: |
73 try: names.remove( name ) | 85 try: |
74 except: pass | 86 names.remove(name) |
87 except Exception: | |
88 pass | |
75 return names | 89 return names |
76 | 90 |
77 #returns the sequence for a species | 91 # returns the sequence for a species |
78 def get_sequence( self, species ): | 92 def get_sequence(self, species): |
79 self.sequences[species].seek( 0 ) | 93 self.sequences[species].seek(0) |
80 return self.sequences[species].read() | 94 return self.sequences[species].read() |
81 | 95 |
82 #returns the reverse complement of the sequence for a species | 96 # returns the reverse complement of the sequence for a species |
83 def get_sequence_reverse_complement( self, species ): | 97 def get_sequence_reverse_complement(self, species): |
84 complement = [base for base in self.get_sequence( species ).translate( self.DNA_COMPLEMENT )] | 98 complement = [base for base in self.get_sequence(species).translate(self.DNA_COMPLEMENT)] |
85 complement.reverse() | 99 complement.reverse() |
86 return "".join( complement ) | 100 return "".join(complement) |
87 | 101 |
88 #sets a position for a species | 102 # sets a position for a species |
89 def set_position( self, index, species, base ): | 103 def set_position(self, index, species, base): |
90 if len( base ) != 1: raise Exception( "A genomic position can only have a length of 1." ) | 104 if len(base) != 1: |
91 return self.set_range( index, species, base ) | 105 raise Exception("A genomic position can only have a length of 1.") |
92 #sets a range for a species | 106 return self.set_range(index, species, base) |
93 def set_range( self, index, species, bases ): | 107 # sets a range for a species |
94 if index >= self.size or index < 0: raise Exception( "Your index (%i) is out of range (0 - %i)." % ( index, self.size - 1 ) ) | 108 |
95 if len( bases ) == 0: raise Exception( "A set of genomic positions can only have a positive length." ) | 109 def set_range(self, index, species, bases): |
96 if species not in self.sequences.keys(): self.add_species( species ) | 110 if index >= self.size or index < 0: |
97 self.sequences[species].seek( index ) | 111 raise Exception("Your index (%i) is out of range (0 - %i)." % (index, self.size - 1)) |
98 self.sequences[species].write( bases ) | 112 if len(bases) == 0: |
99 | 113 raise Exception("A set of genomic positions can only have a positive length.") |
100 #Flush temp file of specified species, or all species | 114 if species not in self.sequences.keys(): |
101 def flush( self, species = None ): | 115 self.add_species(species) |
116 self.sequences[species].seek(index) | |
117 self.sequences[species].write(bases) | |
118 | |
119 # Flush temp file of specified species, or all species | |
120 def flush(self, species=None): | |
102 if species is None: | 121 if species is None: |
103 species = self.sequences.keys() | 122 species = self.sequences.keys() |
104 elif not isinstance( species, list ): | 123 elif not isinstance(species, list): |
105 species = [species] | 124 species = [species] |
106 for spec in species: | 125 for spec in species: |
107 self.sequences[spec].flush() | 126 self.sequences[spec].flush() |
108 | 127 |
109 class GenomicRegionAlignment( RegionAlignment ): | 128 |
110 | 129 class GenomicRegionAlignment(RegionAlignment): |
111 def __init__( self, start, end, species = [] ): | 130 |
112 RegionAlignment.__init__( self, end - start, species ) | 131 def __init__(self, start, end, species=[]): |
132 RegionAlignment.__init__(self, end - start, species) | |
113 self.start = start | 133 self.start = start |
114 self.end = end | 134 self.end = end |
115 | 135 |
116 class SplicedAlignment( object ): | 136 |
117 | 137 class SplicedAlignment(object): |
118 DNA_COMPLEMENT = string.maketrans( "ACGTacgt", "TGCAtgca" ) | 138 |
119 | 139 DNA_COMPLEMENT = maketrans("ACGTacgt", "TGCAtgca") |
120 def __init__( self, exon_starts, exon_ends, species = [] ): | 140 |
121 if not isinstance( exon_starts, list ): | 141 def __init__(self, exon_starts, exon_ends, species=[]): |
142 if not isinstance(exon_starts, list): | |
122 exon_starts = [exon_starts] | 143 exon_starts = [exon_starts] |
123 if not isinstance( exon_ends, list ): | 144 if not isinstance(exon_ends, list): |
124 exon_ends = [exon_ends] | 145 exon_ends = [exon_ends] |
125 assert len( exon_starts ) == len( exon_ends ), "The number of starts does not match the number of sizes." | 146 assert len(exon_starts) == len(exon_ends), "The number of starts does not match the number of sizes." |
126 self.exons = [] | 147 self.exons = [] |
127 for i in range( len( exon_starts ) ): | 148 for i in range(len(exon_starts)): |
128 self.exons.append( GenomicRegionAlignment( exon_starts[i], exon_ends[i], species ) ) | 149 self.exons.append(GenomicRegionAlignment(exon_starts[i], exon_ends[i], species)) |
129 | 150 |
130 #returns the names for species found in alignment, skipping names as requested | 151 # returns the names for species found in alignment, skipping names as requested |
131 def get_species_names( self, skip = [] ): | 152 def get_species_names(self, skip=[]): |
132 if not isinstance( skip, list ): skip = [skip] | 153 if not isinstance(skip, list): |
154 skip = [skip] | |
133 names = [] | 155 names = [] |
134 for exon in self.exons: | 156 for exon in self.exons: |
135 for name in exon.get_species_names( skip = skip ): | 157 for name in exon.get_species_names(skip=skip): |
136 if name not in names: | 158 if name not in names: |
137 names.append( name ) | 159 names.append(name) |
138 return names | 160 return names |
139 | 161 |
140 #returns the sequence for a species | 162 # returns the sequence for a species |
141 def get_sequence( self, species ): | 163 def get_sequence(self, species): |
142 sequence = tempfile.TemporaryFile() | 164 sequence = tempfile.TemporaryFile() |
143 for exon in self.exons: | 165 for exon in self.exons: |
144 if species in exon.get_species_names(): | 166 if species in exon.get_species_names(): |
145 sequence.write( exon.get_sequence( species ) ) | 167 sequence.write(exon.get_sequence(species)) |
146 else: | 168 else: |
147 sequence.write( "-" * exon.size ) | 169 sequence.write("-" * exon.size) |
148 sequence.seek( 0 ) | 170 sequence.seek(0) |
149 return sequence.read() | 171 return sequence.read() |
150 | 172 |
151 #returns the reverse complement of the sequence for a species | 173 # returns the reverse complement of the sequence for a species |
152 def get_sequence_reverse_complement( self, species ): | 174 def get_sequence_reverse_complement(self, species): |
153 complement = [base for base in self.get_sequence( species ).translate( self.DNA_COMPLEMENT )] | 175 complement = [base for base in self.get_sequence(species).translate(self.DNA_COMPLEMENT)] |
154 complement.reverse() | 176 complement.reverse() |
155 return "".join( complement ) | 177 return "".join(complement) |
156 | 178 |
157 #Start and end of coding region | 179 # Start and end of coding region |
158 @property | 180 @property |
159 def start( self ): | 181 def start(self): |
160 return self.exons[0].start | 182 return self.exons[0].start |
183 | |
161 @property | 184 @property |
162 def end( self ): | 185 def end(self): |
163 return self.exons[-1].end | 186 return self.exons[-1].end |
164 | 187 |
165 #Open a MAF index using a UID | 188 # Open a MAF index using a UID |
166 def maf_index_by_uid( maf_uid, index_location_file ): | 189 |
167 for line in open( index_location_file ): | 190 |
191 def maf_index_by_uid(maf_uid, index_location_file): | |
192 for line in open(index_location_file): | |
168 try: | 193 try: |
169 #read each line, if not enough fields, go to next line | 194 # read each line, if not enough fields, go to next line |
170 if line[0:1] == "#" : continue | 195 if line[0:1] == "#": |
196 continue | |
171 fields = line.split('\t') | 197 fields = line.split('\t') |
172 if maf_uid == fields[1]: | 198 if maf_uid == fields[1]: |
173 try: | 199 try: |
174 maf_files = fields[4].replace( "\n", "" ).replace( "\r", "" ).split( "," ) | 200 maf_files = fields[4].replace("\n", "").replace("\r", "").split(",") |
175 return bx.align.maf.MultiIndexed( maf_files, keep_open = True, parse_e_rows = False ) | 201 return bx.align.maf.MultiIndexed(maf_files, keep_open=True, parse_e_rows=False) |
176 except Exception, e: | 202 except Exception as e: |
177 raise Exception( 'MAF UID (%s) found, but configuration appears to be malformed: %s' % ( maf_uid, e ) ) | 203 raise Exception('MAF UID (%s) found, but configuration appears to be malformed: %s' % (maf_uid, e)) |
178 except: | 204 except Exception: |
179 pass | 205 pass |
180 return None | 206 return None |
181 | 207 |
182 #return ( index, temp_index_filename ) for user maf, if available, or build one and return it, return None when no tempfile is created | 208 # return ( index, temp_index_filename ) for user maf, if available, or build one and return it, return None when no tempfile is created |
183 def open_or_build_maf_index( maf_file, index_filename, species = None ): | 209 |
210 | |
211 def open_or_build_maf_index(maf_file, index_filename, species=None): | |
184 try: | 212 try: |
185 return ( bx.align.maf.Indexed( maf_file, index_filename = index_filename, keep_open = True, parse_e_rows = False ), None ) | 213 return (bx.align.maf.Indexed(maf_file, index_filename=index_filename, keep_open=True, parse_e_rows=False), None) |
186 except: | 214 except Exception: |
187 return build_maf_index( maf_file, species = species ) | 215 return build_maf_index(maf_file, species=species) |
188 | 216 |
189 def build_maf_index_species_chromosomes( filename, index_species = None ): | 217 |
218 def build_maf_index_species_chromosomes(filename, index_species=None): | |
190 species = [] | 219 species = [] |
191 species_chromosomes = {} | 220 species_chromosomes = {} |
192 indexes = bx.interval_index_file.Indexes() | 221 indexes = bx.interval_index_file.Indexes() |
193 blocks = 0 | 222 blocks = 0 |
194 try: | 223 try: |
195 maf_reader = bx.align.maf.Reader( open( filename ) ) | 224 maf_reader = bx.align.maf.Reader(open(filename)) |
196 while True: | 225 while True: |
197 pos = maf_reader.file.tell() | 226 pos = maf_reader.file.tell() |
198 block = maf_reader.next() | 227 block = maf_reader.next() |
199 if block is None: | 228 if block is None: |
200 break | 229 break |
201 blocks += 1 | 230 blocks += 1 |
202 for c in block.components: | 231 for c in block.components: |
203 spec = c.src | 232 spec = c.src |
204 chrom = None | 233 chrom = None |
205 if "." in spec: | 234 if "." in spec: |
206 spec, chrom = spec.split( ".", 1 ) | 235 spec, chrom = spec.split(".", 1) |
207 if spec not in species: | 236 if spec not in species: |
208 species.append( spec ) | 237 species.append(spec) |
209 species_chromosomes[spec] = [] | 238 species_chromosomes[spec] = [] |
210 if chrom and chrom not in species_chromosomes[spec]: | 239 if chrom and chrom not in species_chromosomes[spec]: |
211 species_chromosomes[spec].append( chrom ) | 240 species_chromosomes[spec].append(chrom) |
212 if index_species is None or spec in index_species: | 241 if index_species is None or spec in index_species: |
213 forward_strand_start = c.forward_strand_start | 242 forward_strand_start = c.forward_strand_start |
214 forward_strand_end = c.forward_strand_end | 243 forward_strand_end = c.forward_strand_end |
215 try: | 244 try: |
216 forward_strand_start = int( forward_strand_start ) | 245 forward_strand_start = int(forward_strand_start) |
217 forward_strand_end = int( forward_strand_end ) | 246 forward_strand_end = int(forward_strand_end) |
218 except ValueError: | 247 except ValueError: |
219 continue #start and end are not integers, can't add component to index, goto next component | 248 continue # start and end are not integers, can't add component to index, goto next component |
220 #this likely only occurs when parse_e_rows is True? | 249 # this likely only occurs when parse_e_rows is True? |
221 #could a species exist as only e rows? should the | 250 # could a species exist as only e rows? should the |
222 if forward_strand_end > forward_strand_start: | 251 if forward_strand_end > forward_strand_start: |
223 #require positive length; i.e. certain lines have start = end = 0 and cannot be indexed | 252 # require positive length; i.e. certain lines have start = end = 0 and cannot be indexed |
224 indexes.add( c.src, forward_strand_start, forward_strand_end, pos, max=c.src_size ) | 253 indexes.add(c.src, forward_strand_start, forward_strand_end, pos, max=c.src_size) |
225 except Exception, e: | 254 except Exception as e: |
226 #most likely a bad MAF | 255 # most likely a bad MAF |
227 log.debug( 'Building MAF index on %s failed: %s' % ( filename, e ) ) | 256 log.debug('Building MAF index on %s failed: %s' % (filename, e)) |
228 return ( None, [], {}, 0 ) | 257 return (None, [], {}, 0) |
229 return ( indexes, species, species_chromosomes, blocks ) | 258 return (indexes, species, species_chromosomes, blocks) |
230 | 259 |
231 #builds and returns ( index, index_filename ) for specified maf_file | 260 # builds and returns ( index, index_filename ) for specified maf_file |
232 def build_maf_index( maf_file, species = None ): | 261 |
233 indexes, found_species, species_chromosomes, blocks = build_maf_index_species_chromosomes( maf_file, species ) | 262 |
263 def build_maf_index(maf_file, species=None): | |
264 indexes, found_species, species_chromosomes, blocks = build_maf_index_species_chromosomes(maf_file, species) | |
234 if indexes is not None: | 265 if indexes is not None: |
235 fd, index_filename = tempfile.mkstemp() | 266 fd, index_filename = tempfile.mkstemp() |
236 out = os.fdopen( fd, 'w' ) | 267 out = os.fdopen(fd, 'w') |
237 indexes.write( out ) | 268 indexes.write(out) |
238 out.close() | 269 out.close() |
239 return ( bx.align.maf.Indexed( maf_file, index_filename = index_filename, keep_open = True, parse_e_rows = False ), index_filename ) | 270 return (bx.align.maf.Indexed(maf_file, index_filename=index_filename, keep_open=True, parse_e_rows=False), index_filename) |
240 return ( None, None ) | 271 return (None, None) |
241 | 272 |
242 def component_overlaps_region( c, region ): | 273 |
243 if c is None: return False | 274 def component_overlaps_region(c, region): |
275 if c is None: | |
276 return False | |
244 start, end = c.get_forward_strand_start(), c.get_forward_strand_end() | 277 start, end = c.get_forward_strand_start(), c.get_forward_strand_end() |
245 if region.start >= end or region.end <= start: | 278 if region.start >= end or region.end <= start: |
246 return False | 279 return False |
247 return True | 280 return True |
248 | 281 |
249 def chop_block_by_region( block, src, region, species = None, mincols = 0 ): | 282 |
283 def chop_block_by_region(block, src, region, species=None, mincols=0): | |
250 # This chopping method was designed to maintain consistency with how start/end padding gaps have been working in Galaxy thus far: | 284 # This chopping method was designed to maintain consistency with how start/end padding gaps have been working in Galaxy thus far: |
251 # behavior as seen when forcing blocks to be '+' relative to src sequence (ref) and using block.slice_by_component( ref, slice_start, slice_end ) | 285 # behavior as seen when forcing blocks to be '+' relative to src sequence (ref) and using block.slice_by_component( ref, slice_start, slice_end ) |
252 # whether-or-not this is the 'correct' behavior is questionable, but this will at least maintain consistency | 286 # whether-or-not this is the 'correct' behavior is questionable, but this will at least maintain consistency |
253 # comments welcome | 287 # comments welcome |
254 slice_start = block.text_size #max for the min() | 288 slice_start = block.text_size # max for the min() |
255 slice_end = 0 #min for the max() | 289 slice_end = 0 # min for the max() |
256 old_score = block.score #save old score for later use | 290 old_score = block.score # save old score for later use |
257 # We no longer assume only one occurance of src per block, so we need to check them all | 291 # We no longer assume only one occurance of src per block, so we need to check them all |
258 for c in iter_components_by_src( block, src ): | 292 for c in iter_components_by_src(block, src): |
259 if component_overlaps_region( c, region ): | 293 if component_overlaps_region(c, region): |
260 if c.text is not None: | 294 if c.text is not None: |
261 rev_strand = False | 295 rev_strand = False |
262 if c.strand == "-": | 296 if c.strand == "-": |
263 #We want our coord_to_col coordinates to be returned from positive stranded component | 297 # We want our coord_to_col coordinates to be returned from positive stranded component |
264 rev_strand = True | 298 rev_strand = True |
265 c = c.reverse_complement() | 299 c = c.reverse_complement() |
266 start = max( region.start, c.start ) | 300 start = max(region.start, c.start) |
267 end = min( region.end, c.end ) | 301 end = min(region.end, c.end) |
268 start = c.coord_to_col( start ) | 302 start = c.coord_to_col(start) |
269 end = c.coord_to_col( end ) | 303 end = c.coord_to_col(end) |
270 if rev_strand: | 304 if rev_strand: |
271 #need to orient slice coordinates to the original block direction | 305 # need to orient slice coordinates to the original block direction |
272 slice_len = end - start | 306 slice_len = end - start |
273 end = len( c.text ) - start | 307 end = len(c.text) - start |
274 start = end - slice_len | 308 start = end - slice_len |
275 slice_start = min( start, slice_start ) | 309 slice_start = min(start, slice_start) |
276 slice_end = max( end, slice_end ) | 310 slice_end = max(end, slice_end) |
277 | 311 |
278 if slice_start < slice_end: | 312 if slice_start < slice_end: |
279 block = block.slice( slice_start, slice_end ) | 313 block = block.slice(slice_start, slice_end) |
280 if block.text_size > mincols: | 314 if block.text_size > mincols: |
281 # restore old score, may not be accurate, but it is better than 0 for everything? | 315 # restore old score, may not be accurate, but it is better than 0 for everything? |
282 block.score = old_score | 316 block.score = old_score |
283 if species is not None: | 317 if species is not None: |
284 block = block.limit_to_species( species ) | 318 block = block.limit_to_species(species) |
285 block.remove_all_gap_columns() | 319 block.remove_all_gap_columns() |
286 return block | 320 return block |
287 return None | 321 return None |
288 | 322 |
289 def orient_block_by_region( block, src, region, force_strand = None ): | 323 |
290 #loop through components matching src, | 324 def orient_block_by_region(block, src, region, force_strand=None): |
291 #make sure each of these components overlap region | 325 # loop through components matching src, |
292 #cache strand for each of overlaping regions | 326 # make sure each of these components overlap region |
293 #if force_strand / region.strand not in strand cache, reverse complement | 327 # cache strand for each of overlaping regions |
294 ### we could have 2 sequences with same src, overlapping region, on different strands, this would cause no reverse_complementing | 328 # if force_strand / region.strand not in strand cache, reverse complement |
295 strands = [ c.strand for c in iter_components_by_src( block, src ) if component_overlaps_region( c, region ) ] | 329 # we could have 2 sequences with same src, overlapping region, on different strands, this would cause no reverse_complementing |
296 if strands and ( force_strand is None and region.strand not in strands ) or ( force_strand is not None and force_strand not in strands ): | 330 strands = [c.strand for c in iter_components_by_src(block, src) if component_overlaps_region(c, region)] |
331 if strands and (force_strand is None and region.strand not in strands) or (force_strand is not None and force_strand not in strands): | |
297 block = block.reverse_complement() | 332 block = block.reverse_complement() |
298 return block | 333 return block |
299 | 334 |
300 def get_oriented_chopped_blocks_for_region( index, src, region, species = None, mincols = 0, force_strand = None ): | 335 |
301 for block, idx, offset in get_oriented_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols, force_strand ): | 336 def get_oriented_chopped_blocks_for_region(index, src, region, species=None, mincols=0, force_strand=None): |
337 for block, idx, offset in get_oriented_chopped_blocks_with_index_offset_for_region(index, src, region, species, mincols, force_strand): | |
302 yield block | 338 yield block |
303 def get_oriented_chopped_blocks_with_index_offset_for_region( index, src, region, species = None, mincols = 0, force_strand = None ): | 339 |
304 for block, idx, offset in get_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols ): | 340 |
305 yield orient_block_by_region( block, src, region, force_strand ), idx, offset | 341 def get_oriented_chopped_blocks_with_index_offset_for_region(index, src, region, species=None, mincols=0, force_strand=None): |
306 | 342 for block, idx, offset in get_chopped_blocks_with_index_offset_for_region(index, src, region, species, mincols): |
307 #split a block with multiple occurances of src into one block per src | 343 yield orient_block_by_region(block, src, region, force_strand), idx, offset |
308 def iter_blocks_split_by_src( block, src ): | 344 |
309 for src_c in iter_components_by_src( block, src ): | 345 # split a block with multiple occurances of src into one block per src |
310 new_block = bx.align.Alignment( score=block.score, attributes=deepcopy( block.attributes ) ) | 346 |
347 | |
348 def iter_blocks_split_by_src(block, src): | |
349 for src_c in iter_components_by_src(block, src): | |
350 new_block = bx.align.Alignment(score=block.score, attributes=deepcopy(block.attributes)) | |
311 new_block.text_size = block.text_size | 351 new_block.text_size = block.text_size |
312 for c in block.components: | 352 for c in block.components: |
313 if c == src_c or c.src != src: | 353 if c == src_c or c.src != src: |
314 new_block.add_component( deepcopy( c ) ) #components have reference to alignment, dont want to loose reference to original alignment block in original components | 354 new_block.add_component(deepcopy(c)) # components have reference to alignment, dont want to loose reference to original alignment block in original components |
315 yield new_block | 355 yield new_block |
316 | 356 |
317 #split a block into multiple blocks with all combinations of a species appearing only once per block | 357 # split a block into multiple blocks with all combinations of a species appearing only once per block |
318 def iter_blocks_split_by_species( block, species = None ): | 358 |
319 def __split_components_by_species( components_by_species, new_block ): | 359 |
360 def iter_blocks_split_by_species(block, species=None): | |
361 def __split_components_by_species(components_by_species, new_block): | |
320 if components_by_species: | 362 if components_by_species: |
321 #more species with components to add to this block | 363 # more species with components to add to this block |
322 components_by_species = deepcopy( components_by_species ) | 364 components_by_species = deepcopy(components_by_species) |
323 spec_comps = components_by_species.pop( 0 ) | 365 spec_comps = components_by_species.pop(0) |
324 for c in spec_comps: | 366 for c in spec_comps: |
325 newer_block = deepcopy( new_block ) | 367 newer_block = deepcopy(new_block) |
326 newer_block.add_component( deepcopy( c ) ) | 368 newer_block.add_component(deepcopy(c)) |
327 for value in __split_components_by_species( components_by_species, newer_block ): | 369 for value in __split_components_by_species(components_by_species, newer_block): |
328 yield value | 370 yield value |
329 else: | 371 else: |
330 #no more components to add, yield this block | 372 # no more components to add, yield this block |
331 yield new_block | 373 yield new_block |
332 | 374 |
333 #divide components by species | 375 # divide components by species |
334 spec_dict = {} | 376 spec_dict = {} |
335 if not species: | 377 if not species: |
336 species = [] | 378 species = [] |
337 for c in block.components: | 379 for c in block.components: |
338 spec, chrom = src_split( c.src ) | 380 spec, chrom = src_split(c.src) |
339 if spec not in spec_dict: | 381 if spec not in spec_dict: |
340 spec_dict[ spec ] = [] | 382 spec_dict[spec] = [] |
341 species.append( spec ) | 383 species.append(spec) |
342 spec_dict[ spec ].append( c ) | 384 spec_dict[spec].append(c) |
343 else: | 385 else: |
344 for spec in species: | 386 for spec in species: |
345 spec_dict[ spec ] = [] | 387 spec_dict[spec] = [] |
346 for c in iter_components_by_src_start( block, spec ): | 388 for c in iter_components_by_src_start(block, spec): |
347 spec_dict[ spec ].append( c ) | 389 spec_dict[spec].append(c) |
348 | 390 |
349 empty_block = bx.align.Alignment( score=block.score, attributes=deepcopy( block.attributes ) ) #should we copy attributes? | 391 empty_block = bx.align.Alignment(score=block.score, attributes=deepcopy(block.attributes)) # should we copy attributes? |
350 empty_block.text_size = block.text_size | 392 empty_block.text_size = block.text_size |
351 #call recursive function to split into each combo of spec/blocks | 393 # call recursive function to split into each combo of spec/blocks |
352 for value in __split_components_by_species( spec_dict.values(), empty_block ): | 394 for value in __split_components_by_species(spec_dict.values(), empty_block): |
353 sort_block_components_by_block( value, block ) #restore original component order | 395 sort_block_components_by_block(value, block) # restore original component order |
354 yield value | 396 yield value |
355 | 397 |
356 | 398 |
357 #generator yielding only chopped and valid blocks for a specified region | 399 # generator yielding only chopped and valid blocks for a specified region |
358 def get_chopped_blocks_for_region( index, src, region, species = None, mincols = 0 ): | 400 def get_chopped_blocks_for_region(index, src, region, species=None, mincols=0): |
359 for block, idx, offset in get_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols ): | 401 for block, idx, offset in get_chopped_blocks_with_index_offset_for_region(index, src, region, species, mincols): |
360 yield block | 402 yield block |
361 def get_chopped_blocks_with_index_offset_for_region( index, src, region, species = None, mincols = 0 ): | 403 |
362 for block, idx, offset in index.get_as_iterator_with_index_and_offset( src, region.start, region.end ): | 404 |
363 block = chop_block_by_region( block, src, region, species, mincols ) | 405 def get_chopped_blocks_with_index_offset_for_region(index, src, region, species=None, mincols=0): |
406 for block, idx, offset in index.get_as_iterator_with_index_and_offset(src, region.start, region.end): | |
407 block = chop_block_by_region(block, src, region, species, mincols) | |
364 if block is not None: | 408 if block is not None: |
365 yield block, idx, offset | 409 yield block, idx, offset |
366 | 410 |
367 #returns a filled region alignment for specified regions | 411 # returns a filled region alignment for specified regions |
368 def get_region_alignment( index, primary_species, chrom, start, end, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ): | 412 |
369 if species is not None: alignment = RegionAlignment( end - start, species ) | 413 |
370 else: alignment = RegionAlignment( end - start, primary_species ) | 414 def get_region_alignment(index, primary_species, chrom, start, end, strand='+', species=None, mincols=0, overwrite_with_gaps=True): |
371 return fill_region_alignment( alignment, index, primary_species, chrom, start, end, strand, species, mincols, overwrite_with_gaps ) | 415 if species is not None: |
372 | 416 alignment = RegionAlignment(end - start, species) |
373 #reduces a block to only positions exisiting in the src provided | 417 else: |
374 def reduce_block_by_primary_genome( block, species, chromosome, region_start ): | 418 alignment = RegionAlignment(end - start, primary_species) |
375 #returns ( startIndex, {species:texts} | 419 return fill_region_alignment(alignment, index, primary_species, chrom, start, end, strand, species, mincols, overwrite_with_gaps) |
376 #where texts' contents are reduced to only positions existing in the primary genome | 420 |
377 src = "%s.%s" % ( species, chromosome ) | 421 # reduces a block to only positions exisiting in the src provided |
378 ref = block.get_component_by_src( src ) | 422 |
423 | |
424 def reduce_block_by_primary_genome(block, species, chromosome, region_start): | |
425 # returns ( startIndex, {species:texts} | |
426 # where texts' contents are reduced to only positions existing in the primary genome | |
427 src = "%s.%s" % (species, chromosome) | |
428 ref = block.get_component_by_src(src) | |
379 start_offset = ref.start - region_start | 429 start_offset = ref.start - region_start |
380 species_texts = {} | 430 species_texts = {} |
381 for c in block.components: | 431 for c in block.components: |
382 species_texts[ c.src.split( '.' )[0] ] = list( c.text ) | 432 species_texts[c.src.split('.')[0]] = list(c.text) |
383 #remove locations which are gaps in the primary species, starting from the downstream end | 433 # remove locations which are gaps in the primary species, starting from the downstream end |
384 for i in range( len( species_texts[ species ] ) - 1, -1, -1 ): | 434 for i in range(len(species_texts[species]) - 1, -1, -1): |
385 if species_texts[ species ][i] == '-': | 435 if species_texts[species][i] == '-': |
386 for text in species_texts.values(): | 436 for text in species_texts.values(): |
387 text.pop( i ) | 437 text.pop(i) |
388 for spec, text in species_texts.items(): | 438 for spec, text in species_texts.items(): |
389 species_texts[spec] = ''.join( text ) | 439 species_texts[spec] = ''.join(text) |
390 return ( start_offset, species_texts ) | 440 return (start_offset, species_texts) |
391 | 441 |
392 #fills a region alignment | 442 # fills a region alignment |
393 def fill_region_alignment( alignment, index, primary_species, chrom, start, end, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ): | 443 |
394 region = bx.intervals.Interval( start, end ) | 444 |
445 def fill_region_alignment(alignment, index, primary_species, chrom, start, end, strand='+', species=None, mincols=0, overwrite_with_gaps=True): | |
446 region = bx.intervals.Interval(start, end) | |
395 region.chrom = chrom | 447 region.chrom = chrom |
396 region.strand = strand | 448 region.strand = strand |
397 primary_src = "%s.%s" % ( primary_species, chrom ) | 449 primary_src = "%s.%s" % (primary_species, chrom) |
398 | 450 |
399 #Order blocks overlaping this position by score, lowest first | 451 # Order blocks overlaping this position by score, lowest first |
400 blocks = [] | 452 blocks = [] |
401 for block, idx, offset in index.get_as_iterator_with_index_and_offset( primary_src, start, end ): | 453 for block, idx, offset in index.get_as_iterator_with_index_and_offset(primary_src, start, end): |
402 score = float( block.score ) | 454 score = float(block.score) |
403 for i in range( 0, len( blocks ) ): | 455 for i in range(0, len(blocks)): |
404 if score < blocks[i][0]: | 456 if score < blocks[i][0]: |
405 blocks.insert( i, ( score, idx, offset ) ) | 457 blocks.insert(i, (score, idx, offset)) |
406 break | 458 break |
407 else: | 459 else: |
408 blocks.append( ( score, idx, offset ) ) | 460 blocks.append((score, idx, offset)) |
409 | 461 |
410 #gap_chars_tuple = tuple( GAP_CHARS ) | 462 # gap_chars_tuple = tuple( GAP_CHARS ) |
411 gap_chars_str = ''.join( GAP_CHARS ) | 463 gap_chars_str = ''.join(GAP_CHARS) |
412 #Loop through ordered blocks and layer by increasing score | 464 # Loop through ordered blocks and layer by increasing score |
413 for block_dict in blocks: | 465 for block_dict in blocks: |
414 for block in iter_blocks_split_by_species( block_dict[1].get_at_offset( block_dict[2] ) ): #need to handle each occurance of sequence in block seperately | 466 for block in iter_blocks_split_by_species(block_dict[1].get_at_offset(block_dict[2])): # need to handle each occurance of sequence in block seperately |
415 if component_overlaps_region( block.get_component_by_src( primary_src ), region ): | 467 if component_overlaps_region(block.get_component_by_src(primary_src), region): |
416 block = chop_block_by_region( block, primary_src, region, species, mincols ) #chop block | 468 block = chop_block_by_region(block, primary_src, region, species, mincols) # chop block |
417 block = orient_block_by_region( block, primary_src, region ) #orient block | 469 block = orient_block_by_region(block, primary_src, region) # orient block |
418 start_offset, species_texts = reduce_block_by_primary_genome( block, primary_species, chrom, start ) | 470 start_offset, species_texts = reduce_block_by_primary_genome(block, primary_species, chrom, start) |
419 for spec, text in species_texts.items(): | 471 for spec, text in species_texts.items(): |
420 #we should trim gaps from both sides, since these are not positions in this species genome (sequence) | 472 # we should trim gaps from both sides, since these are not positions in this species genome (sequence) |
421 text = text.rstrip( gap_chars_str ) | 473 text = text.rstrip(gap_chars_str) |
422 gap_offset = 0 | 474 gap_offset = 0 |
423 while True in [ text.startswith( gap_char ) for gap_char in GAP_CHARS ]: #python2.4 doesn't accept a tuple for .startswith() | 475 while True in [text.startswith(gap_char) for gap_char in GAP_CHARS]: # python2.4 doesn't accept a tuple for .startswith() |
424 #while text.startswith( gap_chars_tuple ): | 476 # while text.startswith( gap_chars_tuple ): |
425 gap_offset += 1 | 477 gap_offset += 1 |
426 text = text[1:] | 478 text = text[1:] |
427 if not text: | 479 if not text: |
428 break | 480 break |
429 if text: | 481 if text: |
430 if overwrite_with_gaps: | 482 if overwrite_with_gaps: |
431 alignment.set_range( start_offset + gap_offset, spec, text ) | 483 alignment.set_range(start_offset + gap_offset, spec, text) |
432 else: | 484 else: |
433 for i, char in enumerate( text ): | 485 for i, char in enumerate(text): |
434 if char not in GAP_CHARS: | 486 if char not in GAP_CHARS: |
435 alignment.set_position( start_offset + gap_offset + i, spec, char ) | 487 alignment.set_position(start_offset + gap_offset + i, spec, char) |
436 return alignment | 488 return alignment |
437 | 489 |
438 #returns a filled spliced region alignment for specified region with start and end lists | 490 # returns a filled spliced region alignment for specified region with start and end lists |
439 def get_spliced_region_alignment( index, primary_species, chrom, starts, ends, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ): | 491 |
440 #create spliced alignment object | 492 |
441 if species is not None: alignment = SplicedAlignment( starts, ends, species ) | 493 def get_spliced_region_alignment(index, primary_species, chrom, starts, ends, strand='+', species=None, mincols=0, overwrite_with_gaps=True): |
442 else: alignment = SplicedAlignment( starts, ends, [primary_species] ) | 494 # create spliced alignment object |
495 if species is not None: | |
496 alignment = SplicedAlignment(starts, ends, species) | |
497 else: | |
498 alignment = SplicedAlignment(starts, ends, [primary_species]) | |
443 for exon in alignment.exons: | 499 for exon in alignment.exons: |
444 fill_region_alignment( exon, index, primary_species, chrom, exon.start, exon.end, strand, species, mincols, overwrite_with_gaps ) | 500 fill_region_alignment(exon, index, primary_species, chrom, exon.start, exon.end, strand, species, mincols, overwrite_with_gaps) |
445 return alignment | 501 return alignment |
446 | 502 |
447 #loop through string array, only return non-commented lines | 503 # loop through string array, only return non-commented lines |
448 def line_enumerator( lines, comment_start = '#' ): | 504 |
505 | |
506 def line_enumerator(lines, comment_start='#'): | |
449 i = 0 | 507 i = 0 |
450 for line in lines: | 508 for line in lines: |
451 if not line.startswith( comment_start ): | 509 if not line.startswith(comment_start): |
452 i += 1 | 510 i += 1 |
453 yield ( i, line ) | 511 yield (i, line) |
454 | 512 |
455 #read a GeneBed file, return list of starts, ends, raw fields | 513 # read a GeneBed file, return list of starts, ends, raw fields |
456 def get_starts_ends_fields_from_gene_bed( line ): | 514 |
457 #Starts and ends for exons | 515 |
516 def get_starts_ends_fields_from_gene_bed(line): | |
517 # Starts and ends for exons | |
458 starts = [] | 518 starts = [] |
459 ends = [] | 519 ends = [] |
460 | 520 |
461 fields = line.split() | 521 fields = line.split() |
462 #Requires atleast 12 BED columns | 522 # Requires atleast 12 BED columns |
463 if len(fields) < 12: | 523 if len(fields) < 12: |
464 raise Exception( "Not a proper 12 column BED line (%s)." % line ) | 524 raise Exception("Not a proper 12 column BED line (%s)." % line) |
465 chrom = fields[0] | 525 tx_start = int(fields[1]) |
466 tx_start = int( fields[1] ) | 526 strand = fields[5] |
467 tx_end = int( fields[2] ) | 527 if strand != '-': |
468 name = fields[3] | 528 strand = '+' # Default strand is + |
469 strand = fields[5] | 529 cds_start = int(fields[6]) |
470 if strand != '-': strand='+' #Default strand is + | 530 cds_end = int(fields[7]) |
471 cds_start = int( fields[6] ) | 531 |
472 cds_end = int( fields[7] ) | 532 # Calculate and store starts and ends of coding exons |
473 | |
474 #Calculate and store starts and ends of coding exons | |
475 region_start, region_end = cds_start, cds_end | 533 region_start, region_end = cds_start, cds_end |
476 exon_starts = map( int, fields[11].rstrip( ',\n' ).split( ',' ) ) | 534 exon_starts = map(int, fields[11].rstrip(',\n').split(',')) |
477 exon_starts = map( ( lambda x: x + tx_start ), exon_starts ) | 535 exon_starts = map((lambda x: x + tx_start), exon_starts) |
478 exon_ends = map( int, fields[10].rstrip( ',' ).split( ',' ) ) | 536 exon_ends = map(int, fields[10].rstrip(',').split(',')) |
479 exon_ends = map( ( lambda x, y: x + y ), exon_starts, exon_ends ); | 537 exon_ends = map((lambda x, y: x + y), exon_starts, exon_ends) |
480 for start, end in zip( exon_starts, exon_ends ): | 538 for start, end in zip(exon_starts, exon_ends): |
481 start = max( start, region_start ) | 539 start = max(start, region_start) |
482 end = min( end, region_end ) | 540 end = min(end, region_end) |
483 if start < end: | 541 if start < end: |
484 starts.append( start ) | 542 starts.append(start) |
485 ends.append( end ) | 543 ends.append(end) |
486 return ( starts, ends, fields ) | 544 return (starts, ends, fields) |
487 | 545 |
488 def iter_components_by_src( block, src ): | 546 |
547 def iter_components_by_src(block, src): | |
489 for c in block.components: | 548 for c in block.components: |
490 if c.src == src: | 549 if c.src == src: |
491 yield c | 550 yield c |
492 | 551 |
493 def get_components_by_src( block, src ): | 552 |
494 return [ value for value in iter_components_by_src( block, src ) ] | 553 def get_components_by_src(block, src): |
495 | 554 return [value for value in iter_components_by_src(block, src)] |
496 def iter_components_by_src_start( block, src ): | 555 |
556 | |
557 def iter_components_by_src_start(block, src): | |
497 for c in block.components: | 558 for c in block.components: |
498 if c.src.startswith( src ): | 559 if c.src.startswith(src): |
499 yield c | 560 yield c |
500 | 561 |
501 def get_components_by_src_start( block, src ): | 562 |
502 return [ value for value in iter_components_by_src_start( block, src ) ] | 563 def get_components_by_src_start(block, src): |
503 | 564 return [value for value in iter_components_by_src_start(block, src)] |
504 def sort_block_components_by_block( block1, block2 ): | 565 |
505 #orders the components in block1 by the index of the component in block2 | 566 |
506 #block1 must be a subset of block2 | 567 def sort_block_components_by_block(block1, block2): |
507 #occurs in-place | 568 # orders the components in block1 by the index of the component in block2 |
508 return block1.components.sort( cmp = lambda x, y: block2.components.index( x ) - block2.components.index( y ) ) | 569 # block1 must be a subset of block2 |
509 | 570 # occurs in-place |
510 def get_species_in_maf( maf_filename ): | 571 return block1.components.sort(cmp=lambda x, y: block2.components.index(x) - block2.components.index(y)) |
572 | |
573 | |
574 def get_species_in_maf(maf_filename): | |
511 species = [] | 575 species = [] |
512 for block in bx.align.maf.Reader( open( maf_filename ) ): | 576 for block in bx.align.maf.Reader(open(maf_filename)): |
513 for spec in get_species_in_block( block ): | 577 for spec in get_species_in_block(block): |
514 if spec not in species: | 578 if spec not in species: |
515 species.append( spec ) | 579 species.append(spec) |
516 return species | 580 return species |
517 | 581 |
518 def parse_species_option( species ): | 582 |
583 def parse_species_option(species): | |
519 if species: | 584 if species: |
520 species = species.split( ',' ) | 585 species = species.split(',') |
521 if 'None' not in species: | 586 if 'None' not in species: |
522 return species | 587 return species |
523 return None #provided species was '', None, or had 'None' in it | 588 return None # provided species was '', None, or had 'None' in it |
524 | 589 |
525 def remove_temp_index_file( index_filename ): | 590 |
526 try: os.unlink( index_filename ) | 591 def remove_temp_index_file(index_filename): |
527 except: pass | 592 try: |
528 | 593 os.unlink(index_filename) |
529 #Below are methods to deal with FASTA files | 594 except Exception: |
530 | 595 pass |
531 def get_fasta_header( component, attributes = {}, suffix = None ): | 596 |
532 header = ">%s(%s):%i-%i|" % ( component.src, component.strand, component.get_forward_strand_start(), component.get_forward_strand_end() ) | 597 # Below are methods to deal with FASTA files |
598 | |
599 | |
600 def get_fasta_header(component, attributes={}, suffix=None): | |
601 header = ">%s(%s):%i-%i|" % (component.src, component.strand, component.get_forward_strand_start(), component.get_forward_strand_end()) | |
533 for key, value in attributes.iteritems(): | 602 for key, value in attributes.iteritems(): |
534 header = "%s%s=%s|" % ( header, key, value ) | 603 header = "%s%s=%s|" % (header, key, value) |
535 if suffix: | 604 if suffix: |
536 header = "%s%s" % ( header, suffix ) | 605 header = "%s%s" % (header, suffix) |
537 else: | 606 else: |
538 header = "%s%s" % ( header, src_split( component.src )[ 0 ] ) | 607 header = "%s%s" % (header, src_split(component.src)[0]) |
539 return header | 608 return header |
540 | 609 |
541 def get_attributes_from_fasta_header( header ): | 610 |
542 if not header: return {} | 611 def get_attributes_from_fasta_header(header): |
612 if not header: | |
613 return {} | |
543 attributes = {} | 614 attributes = {} |
544 header = header.lstrip( '>' ) | 615 header = header.lstrip('>') |
545 header = header.strip() | 616 header = header.strip() |
546 fields = header.split( '|' ) | 617 fields = header.split('|') |
547 try: | 618 try: |
548 region = fields[0] | 619 region = fields[0] |
549 region = region.split( '(', 1 ) | 620 region = region.split('(', 1) |
550 temp = region[0].split( '.', 1 ) | 621 temp = region[0].split('.', 1) |
551 attributes['species'] = temp[0] | 622 attributes['species'] = temp[0] |
552 if len( temp ) == 2: | 623 if len(temp) == 2: |
553 attributes['chrom'] = temp[1] | 624 attributes['chrom'] = temp[1] |
554 else: | 625 else: |
555 attributes['chrom'] = temp[0] | 626 attributes['chrom'] = temp[0] |
556 region = region[1].split( ')', 1 ) | 627 region = region[1].split(')', 1) |
557 attributes['strand'] = region[0] | 628 attributes['strand'] = region[0] |
558 region = region[1].lstrip( ':' ).split( '-' ) | 629 region = region[1].lstrip(':').split('-') |
559 attributes['start'] = int( region[0] ) | 630 attributes['start'] = int(region[0]) |
560 attributes['end'] = int( region[1] ) | 631 attributes['end'] = int(region[1]) |
561 except: | 632 except Exception: |
562 #fields 0 is not a region coordinate | 633 # fields 0 is not a region coordinate |
563 pass | 634 pass |
564 if len( fields ) > 2: | 635 if len(fields) > 2: |
565 for i in xrange( 1, len( fields ) - 1 ): | 636 for i in range(1, len(fields) - 1): |
566 prop = fields[i].split( '=', 1 ) | 637 prop = fields[i].split('=', 1) |
567 if len( prop ) == 2: | 638 if len(prop) == 2: |
568 attributes[ prop[0] ] = prop[1] | 639 attributes[prop[0]] = prop[1] |
569 if len( fields ) > 1: | 640 if len(fields) > 1: |
570 attributes['__suffix__'] = fields[-1] | 641 attributes['__suffix__'] = fields[-1] |
571 return attributes | 642 return attributes |
572 | 643 |
573 def iter_fasta_alignment( filename ): | 644 |
645 def iter_fasta_alignment(filename): | |
574 class fastaComponent: | 646 class fastaComponent: |
575 def __init__( self, species, text = "" ): | 647 def __init__(self, species, text=""): |
576 self.species = species | 648 self.species = species |
577 self.text = text | 649 self.text = text |
578 def extend( self, text ): | 650 |
579 self.text = self.text + text.replace( '\n', '' ).replace( '\r', '' ).strip() | 651 def extend(self, text): |
580 #yields a list of fastaComponents for a FASTA file | 652 self.text = self.text + text.replace('\n', '').replace('\r', '').strip() |
581 f = open( filename, 'rb' ) | 653 # yields a list of fastaComponents for a FASTA file |
582 components = [] | 654 with open(filename, 'r') as f: |
583 #cur_component = None | 655 components = [] |
584 while True: | 656 # cur_component = None |
585 line = f.readline() | 657 while True: |
586 if not line: | 658 line = f.readline() |
587 if components: | 659 if not line: |
588 yield components | 660 if components: |
589 return | 661 yield components |
590 line = line.strip() | 662 return |
591 if not line: | 663 line = line.strip() |
592 if components: | 664 if not line: |
593 yield components | 665 if components: |
594 components = [] | 666 yield components |
595 elif line.startswith( '>' ): | 667 components = [] |
596 attributes = get_attributes_from_fasta_header( line ) | 668 elif line.startswith('>'): |
597 components.append( fastaComponent( attributes['species'] ) ) | 669 attributes = get_attributes_from_fasta_header(line) |
598 elif components: | 670 components.append(fastaComponent(attributes['species'])) |
599 components[-1].extend( line ) | 671 elif components: |
600 | 672 components[-1].extend(line) |
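
The central compatibility change in this revision is the `maketrans` shim: Python 3 moved `string.maketrans` to the `str.maketrans` static method, so the module now resolves whichever is available at import time and builds its `DNA_COMPLEMENT` table from it. A minimal standalone sketch of the same pattern (the `revcomp` helper name is illustrative; the logic mirrors `get_sequence_reverse_complement`):

```python
# Resolve maketrans on both Python 2 and 3, as the module does at import time.
try:
    maketrans = str.maketrans          # Python 3
except AttributeError:
    from string import maketrans       # Python 2

# Same translation table as RegionAlignment.DNA_COMPLEMENT.
DNA_COMPLEMENT = maketrans("ACGTacgt", "TGCAtgca")


def revcomp(sequence):
    # Complement every base, then reverse -- the same two steps
    # get_sequence_reverse_complement() performs.
    return sequence.translate(DNA_COMPLEMENT)[::-1]


assert revcomp("ACGTacgt-") == "-acgtACGT"
```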
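For orientation, the MAF source-naming convention that `src_split()` implements: a component `src` such as `hg18.chrX` is split on the first `SRC_SPLIT_CHAR` (`.`) into species and chromosome, and a bare species name doubles as its own chromosome. A self-contained sketch with illustrative genome names:

```python
SRC_SPLIT_CHAR = '.'


def src_split(src):
    # Split "species.chrom" on the first '.' only, so chromosome names
    # that themselves contain dots survive intact.
    fields = src.split(SRC_SPLIT_CHAR, 1)
    spec = fields.pop(0)
    chrom = fields.pop(0) if fields else spec
    return spec, chrom


assert src_split("hg18.chrX") == ("hg18", "chrX")
assert src_split("panTro2.chr6.scaffold_1") == ("panTro2", "chr6.scaffold_1")
assert src_split("danRer4") == ("danRer4", "danRer4")  # no chromosome given
```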
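The FASTA helpers at the bottom of the module round-trip a pipe-delimited header format. Here is a hypothetical header in the shape `get_fasta_header()` emits, annotated with what `get_attributes_from_fasta_header()` would recover from it (all values illustrative):

```python
header = ">hg18.chrX(+):1000-2000|score=5.0|hg18"
# get_attributes_from_fasta_header(header) parses this as:
#   species: 'hg18'        (src before the first '.')
#   chrom:   'chrX'        (src after the first '.')
#   strand:  '+'           (inside the parentheses)
#   start:   1000, end: 2000
#   score:   '5.0'         (any middle 'key=value' field)
#   __suffix__: 'hg18'     (the final field; defaults to the species name)
```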
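Finally, a hedged end-to-end usage sketch of the indexing helpers, assuming `bx-python` is installed, the module is importable as `maf_utilities`, and the filenames and species names are hypothetical:

```python
import maf_utilities

# Open an existing index for the MAF file, or build a temporary one.
index, temp_index = maf_utilities.open_or_build_maf_index(
    "alignments.maf", "alignments.maf.index", species=["hg18"])
if index is None:
    maf_utilities.tool_fail("Unable to open or build a MAF index.")

# Fetch a gap-padded alignment of a region on the primary species.
alignment = maf_utilities.get_region_alignment(
    index, primary_species="hg18", chrom="chrX",
    start=1000, end=2000, strand="+", species=["hg18", "panTro2"])
print(alignment.get_sequence("panTro2"))

# Remove the temporary index file, if one was created.
if temp_index:
    maf_utilities.remove_temp_index_file(temp_index)
```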