Mercurial > repos > iuc > extract_genomic_dna
comparison extract_genomic_dna_utils.py @ 0:8dd8e89c0603 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/extract_genomic_dna commit b'67cff25a50ba173b0468819204d0999496f68ea9'
author | iuc |
---|---|
date | Tue, 19 Jan 2016 09:34:23 -0500 |
parents | |
children | 702970e4a134 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:8dd8e89c0603 |
---|---|
1 import copy | |
2 import os | |
3 import subprocess | |
4 import sys | |
5 import tempfile | |
6 | |
7 from bx.intervals.io import Comment, Header, GenomicInterval | |
8 from bx.intervals.io import GenomicIntervalReader, NiceReaderWrapper, ParseError | |
9 | |
10 # Default chrom, start, end, strand cols for a bed file | |
11 BED_DEFAULT_COLS = 0, 1, 2, 5 | |
12 | |
13 | |
14 class GFFInterval(GenomicInterval): | |
15 """ | |
16 A GFF interval, including attributes. If file is strictly a GFF file, | |
17 only attribute is 'group.' | |
18 """ | |
19 | |
20 def __init__(self, reader, fields, chrom_col=0, feature_col=2, start_col=3, end_col=4, | |
21 strand_col=6, score_col=5, default_strand='.', fix_strand=False): | |
22 # GFF format allows '.' for strand but GenomicInterval does not. To get around this, | |
23 # temporarily set strand and then unset after initing GenomicInterval. | |
24 unknown_strand = False | |
25 if not fix_strand and fields[strand_col] == '.': | |
26 unknown_strand = True | |
27 fields[strand_col] = '+' | |
28 GenomicInterval.__init__(self, reader, fields, chrom_col, start_col, end_col, | |
29 strand_col, default_strand, fix_strand=fix_strand) | |
30 if unknown_strand: | |
31 self.strand = '.' | |
32 self.fields[strand_col] = '.' | |
33 # Handle feature, score column. | |
34 self.feature_col = feature_col | |
35 if self.feature_col >= self.nfields: | |
36 stop_err("No field for feature_col (%d)" % feature_col) | |
37 self.feature = self.fields[self.feature_col] | |
38 self.score_col = score_col | |
39 if self.score_col >= self.nfields: | |
40 stop_err("No field for score_col (%d)" % score_col) | |
41 self.score = self.fields[self.score_col] | |
42 # GFF attributes. | |
43 self.attributes = parse_gff_attributes(fields[8]) | |
44 | |
45 def copy(self): | |
46 return GFFInterval(self.reader, list(self.fields), self.chrom_col, self.feature_col, | |
47 self.start_col, self.end_col, self.strand_col, self.score_col, self.strand) | |
48 | |
49 | |
50 class GFFFeature(GFFInterval): | |
51 """ | |
52 A GFF feature, which can include multiple intervals. | |
53 """ | |
54 | |
55 def __init__(self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4, strand_col=6, | |
56 score_col=5, default_strand='.', fix_strand=False, intervals=[], raw_size=0): | |
57 # Use copy so that first interval and feature do not share fields. | |
58 GFFInterval.__init__(self, reader, copy.deepcopy(intervals[0].fields), chrom_col, feature_col, | |
59 start_col, end_col, strand_col, score_col, default_strand, fix_strand=fix_strand) | |
60 self.intervals = intervals | |
61 self.raw_size = raw_size | |
62 # Use intervals to set feature attributes. | |
63 for interval in self.intervals: | |
64 # Error checking. NOTE: intervals need not share the same strand. | |
65 if interval.chrom != self.chrom: | |
66 stop_err("interval chrom does not match self chrom: %s != %s" % (interval.chrom, self.chrom)) | |
67 # Set start, end of interval. | |
68 if interval.start < self.start: | |
69 self.start = interval.start | |
70 if interval.end > self.end: | |
71 self.end = interval.end | |
72 | |
73 def name(self): | |
74 """ | |
75 Returns feature's name. | |
76 """ | |
77 name = None | |
78 # Preference for name: | |
79 # GTF: 'gene_id', 'transcript_id' | |
80 # GFF3: 'ID', 'id' | |
81 # GFF: 'group' | |
82 for attr_name in ['gene_id', 'transcript_id', 'ID', 'id', 'group']: | |
83 name = self.attributes.get(attr_name, None) | |
84 if name is not None: | |
85 break | |
86 return name | |
87 | |
88 def copy(self): | |
89 intervals_copy = [] | |
90 for interval in self.intervals: | |
91 intervals_copy.append(interval.copy()) | |
92 return GFFFeature(self.reader, self.chrom_col, self.feature_col, self.start_col, self.end_col, | |
93 self.strand_col, self.score_col, self.strand, intervals=intervals_copy) | |
94 | |
95 def lines(self): | |
96 lines = [] | |
97 for interval in self.intervals: | |
98 lines.append('\t'.join(interval.fields)) | |
99 return lines | |
100 | |
101 | |
102 class GFFReaderWrapper(NiceReaderWrapper): | |
103 """ | |
104 Reader wrapper for GFF files which has two major functions: | |
105 1. group entries for GFF file (via group column), GFF3 (via id attribute), | |
106 or GTF (via gene_id/transcript id); | |
107 2. convert coordinates from GFF format--starting and ending coordinates | |
108 are 1-based, closed--to the 'traditional'/BED interval format--0 based, | |
109 half-open. This is useful when using GFF files as inputs to tools that | |
110 expect traditional interval format. | |
111 """ | |
112 | |
113 def __init__(self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4, strand_col=6, | |
114 score_col=5, fix_strand=False, convert_to_bed_coord=False, **kwargs): | |
115 NiceReaderWrapper.__init__(self, reader, chrom_col=chrom_col, start_col=start_col, end_col=end_col, | |
116 strand_col=strand_col, fix_strand=fix_strand, **kwargs) | |
117 self.feature_col = feature_col | |
118 self.score_col = score_col | |
119 self.convert_to_bed_coord = convert_to_bed_coord | |
120 self.last_line = None | |
121 self.cur_offset = 0 | |
122 self.seed_interval = None | |
123 self.seed_interval_line_len = 0 | |
124 | |
125 def parse_row(self, line): | |
126 interval = GFFInterval(self, line.split("\t"), self.chrom_col, self.feature_col, self.start_col, | |
127 self.end_col, self.strand_col, self.score_col, self.default_strand, | |
128 fix_strand=self.fix_strand) | |
129 return interval | |
130 | |
131 def next(self): | |
132 """ | |
133 Returns next GFFFeature. | |
134 """ | |
135 | |
136 def handle_parse_error(parse_error): | |
137 """ | |
138 Actions to take when ParseError found. | |
139 """ | |
140 if self.outstream: | |
141 if self.print_delegate and hasattr(self.print_delegate, "__call__"): | |
142 self.print_delegate(self.outstream, e, self) | |
143 self.skipped += 1 | |
144 # No reason to stuff an entire bad file into memory. | |
145 if self.skipped < 10: | |
146 self.skipped_lines.append((self.linenum, self.current_line, str(e))) | |
147 # Get next GFFFeature | |
148 raw_size = self.seed_interval_line_len | |
149 # If there is no seed interval, set one. Also, if there are no more | |
150 # intervals to read, this is where iterator dies. | |
151 if not self.seed_interval: | |
152 while not self.seed_interval: | |
153 try: | |
154 self.seed_interval = GenomicIntervalReader.next(self) | |
155 except ParseError as e: | |
156 handle_parse_error(e) | |
157 finally: | |
158 raw_size += len(self.current_line) | |
159 # If header or comment, clear seed interval and return it with its size. | |
160 if isinstance(self.seed_interval, (Header, Comment)): | |
161 return_val = self.seed_interval | |
162 return_val.raw_size = len(self.current_line) | |
163 self.seed_interval = None | |
164 self.seed_interval_line_len = 0 | |
165 return return_val | |
166 # Initialize feature identifier from seed. | |
167 # For GFF. | |
168 feature_group = self.seed_interval.attributes.get('group', None) | |
169 # For GFF3 | |
170 feature_id = self.seed_interval.attributes.get('ID', None) | |
171 # For GTF. | |
172 feature_transcript_id = self.seed_interval.attributes.get('transcript_id', None) | |
173 # Read all intervals associated with seed. | |
174 feature_intervals = [] | |
175 feature_intervals.append(self.seed_interval) | |
176 while True: | |
177 try: | |
178 interval = GenomicIntervalReader.next(self) | |
179 raw_size += len(self.current_line) | |
180 except StopIteration as e: | |
181 # No more intervals to read, but last feature needs to be | |
182 # returned. | |
183 interval = None | |
184 raw_size += len(self.current_line) | |
185 break | |
186 except ParseError as e: | |
187 handle_parse_error(e) | |
188 raw_size += len(self.current_line) | |
189 continue | |
190 # Ignore comments. | |
191 if isinstance(interval, Comment): | |
192 continue | |
193 # Determine if interval is part of feature. | |
194 part_of = False | |
195 group = interval.attributes.get('group', None) | |
196 # GFF test: | |
197 if group and feature_group == group: | |
198 part_of = True | |
199 # GFF3 test: | |
200 parent_id = interval.attributes.get('Parent', None) | |
201 cur_id = interval.attributes.get('ID', None) | |
202 if (cur_id and cur_id == feature_id) or (parent_id and parent_id == feature_id): | |
203 part_of = True | |
204 # GTF test: | |
205 transcript_id = interval.attributes.get('transcript_id', None) | |
206 if transcript_id and transcript_id == feature_transcript_id: | |
207 part_of = True | |
208 # If interval is not part of feature, clean up and break. | |
209 if not part_of: | |
210 # Adjust raw size because current line is not part of feature. | |
211 raw_size -= len(self.current_line) | |
212 break | |
213 # Interval associated with feature. | |
214 feature_intervals.append(interval) | |
215 # Last interval read is the seed for the next interval. | |
216 self.seed_interval = interval | |
217 self.seed_interval_line_len = len(self.current_line) | |
218 # Return feature. | |
219 feature = GFFFeature(self, self.chrom_col, self.feature_col, self.start_col, | |
220 self.end_col, self.strand_col, self.score_col, | |
221 self.default_strand, fix_strand=self.fix_strand, | |
222 intervals=feature_intervals, raw_size=raw_size) | |
223 # Convert to BED coords? | |
224 if self.convert_to_bed_coord: | |
225 convert_gff_coords_to_bed(feature) | |
226 return feature | |
227 | |
228 | |
229 def convert_bed_coords_to_gff(interval): | |
230 """ | |
231 Converts an interval object's coordinates from BED format to GFF format. | |
232 Accepted object types include GenomicInterval and list (where the first | |
233 element in the list is the interval's start, and the second element is | |
234 the interval's end). | |
235 """ | |
236 if isinstance(interval, GenomicInterval): | |
237 interval.start += 1 | |
238 if isinstance(interval, GFFFeature): | |
239 for subinterval in interval.intervals: | |
240 convert_bed_coords_to_gff(subinterval) | |
241 elif isinstance(interval, list): | |
242 interval[0] += 1 | |
243 return interval | |
244 | |
245 | |
246 def convert_gff_coords_to_bed(interval): | |
247 """ | |
248 Converts an interval object's coordinates from GFF format to BED format. | |
249 Accepted object types include GFFFeature, GenomicInterval, and list (where | |
250 the first element in the list is the interval's start, and the second | |
251 element is the interval's end). | |
252 """ | |
253 if isinstance(interval, GenomicInterval): | |
254 interval.start -= 1 | |
255 if isinstance(interval, GFFFeature): | |
256 for subinterval in interval.intervals: | |
257 convert_gff_coords_to_bed(subinterval) | |
258 elif isinstance(interval, list): | |
259 interval[0] -= 1 | |
260 return interval | |
261 | |
262 | |
263 def convert_to_twobit(reference_genome): | |
264 """ | |
265 Create 2bit file history fasta dataset. | |
266 """ | |
267 try: | |
268 seq_path = tempfile.NamedTemporaryFile(dir=".").name | |
269 cmd = "faToTwoBit %s %s" % (reference_genome, seq_path) | |
270 tmp_name = tempfile.NamedTemporaryFile(dir=".").name | |
271 tmp_stderr = open(tmp_name, 'wb') | |
272 proc = subprocess.Popen(args=cmd, shell=True, stderr=tmp_stderr.fileno()) | |
273 returncode = proc.wait() | |
274 tmp_stderr.close() | |
275 if returncode != 0: | |
276 # Get stderr, allowing for case where it's very large. | |
277 tmp_stderr = open(tmp_name, 'rb') | |
278 stderr = '' | |
279 buffsize = 1048576 | |
280 try: | |
281 while True: | |
282 stderr += tmp_stderr.read(buffsize) | |
283 if not stderr or len(stderr) % buffsize != 0: | |
284 break | |
285 except OverflowError: | |
286 pass | |
287 tmp_stderr.close() | |
288 os.remove(tmp_name) | |
289 stop_err(stderr) | |
290 return seq_path | |
291 except Exception, e: | |
292 stop_err('Error running faToTwoBit. ' + str(e)) | |
293 | |
294 | |
295 def get_lines(feature): | |
296 # Get feature's line(s). | |
297 if isinstance(feature, GFFFeature): | |
298 return feature.lines() | |
299 else: | |
300 return [feature.rstrip('\r\n')] | |
301 | |
302 | |
303 def gff_attributes_to_str(attrs, gff_format): | |
304 """ | |
305 Convert GFF attributes to string. Supported formats are GFF3, GTF. | |
306 """ | |
307 if gff_format == 'GTF': | |
308 format_string = '%s "%s"' | |
309 # Convert group (GFF) and ID, parent (GFF3) attributes to | |
310 # transcript_id, gene_id. | |
311 id_attr = None | |
312 if 'group' in attrs: | |
313 id_attr = 'group' | |
314 elif 'ID' in attrs: | |
315 id_attr = 'ID' | |
316 elif 'Parent' in attrs: | |
317 id_attr = 'Parent' | |
318 if id_attr: | |
319 attrs['transcript_id'] = attrs['gene_id'] = attrs[id_attr] | |
320 elif gff_format == 'GFF3': | |
321 format_string = '%s=%s' | |
322 attrs_strs = [] | |
323 for name, value in attrs.items(): | |
324 attrs_strs.append(format_string % (name, value)) | |
325 return " ; ".join(attrs_strs) | |
326 | |
327 | |
328 def parse_cols_arg(cols): | |
329 """ | |
330 Parse a columns command line argument into a four-tuple. | |
331 """ | |
332 if cols: | |
333 # Handle case where no strand column included - in this case, cols | |
334 # looks something like 1,2,3, | |
335 if cols.endswith(','): | |
336 cols += '0' | |
337 col_list = map(lambda x: int(x) - 1, cols.split(",")) | |
338 return col_list | |
339 else: | |
340 return BED_DEFAULT_COLS | |
341 | |
342 | |
343 def parse_gff_attributes(attr_str): | |
344 """ | |
345 Parses a GFF/GTF attribute string and returns a dictionary of name-value | |
346 pairs. The general format for a GFF3 attributes string is | |
347 name1=value1;name2=value2 | |
348 The general format for a GTF attribute string is | |
349 name1 "value1" ; name2 "value2" | |
350 The general format for a GFF attribute string is a single string that | |
351 denotes the interval's group; in this case, method returns a dictionary | |
352 with a single key-value pair, and key name is 'group'. | |
353 """ | |
354 attributes_list = attr_str.split(";") | |
355 attributes = {} | |
356 for name_value_pair in attributes_list: | |
357 # Try splitting by '=' (GFF3) first because spaces are allowed in GFF3 | |
358 # attribute; next, try double quotes for GTF. | |
359 pair = name_value_pair.strip().split("=") | |
360 if len(pair) == 1: | |
361 pair = name_value_pair.strip().split("\"") | |
362 if len(pair) == 1: | |
363 # Could not split for some reason. | |
364 continue | |
365 if pair == '': | |
366 continue | |
367 name = pair[0].strip() | |
368 if name == '': | |
369 continue | |
370 # Need to strip double quote from values | |
371 value = pair[1].strip(" \"") | |
372 attributes[name] = value | |
373 if len(attributes) == 0: | |
374 # Could not split attributes string, so entire string must be | |
375 # 'group' attribute. This is the case for strictly GFF files. | |
376 attributes['group'] = attr_str | |
377 return attributes | |
378 | |
379 | |
380 def reverse_complement(s): | |
381 complement_dna = {"A": "T", "T": "A", "C": "G", "G": "C", "a": "t", "t": "a", "c": "g", "g": "c", "N": "N", "n": "n"} | |
382 reversed_s = [] | |
383 for i in s: | |
384 reversed_s.append(complement_dna[i]) | |
385 reversed_s.reverse() | |
386 return "".join(reversed_s) | |
387 | |
388 | |
389 def stop_err(msg): | |
390 sys.stderr.write(msg) | |
391 sys.exit(1) |