comparison gff3_rebase.py @ 1:497c6bb3b717 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/jbrowse commit 0887009a23d176b21536c9fd8a18c4fecc417d4f
author iuc
date Thu, 18 Jun 2015 12:10:51 -0400
parents
children 7342f467507b
comparison
equal deleted inserted replaced
0:2c9e5136b416 1:497c6bb3b717
1 #!/usr/bin/env python
2 import sys
3 import logging
4 logging.basicConfig(level=logging.INFO)
5 import argparse
6 from BCBio import GFF
7 from Bio.SeqFeature import FeatureLocation
8 log = logging.getLogger(__name__)
9
10 __author__ = "Eric Rasche"
11 __version__ = "0.4.0"
12 __maintainer__ = "Eric Rasche"
13 __email__ = "esr@tamu.edu"
14
15
16 def __get_features(child, interpro=False):
17 child_features = {}
18 for rec in GFF.parse(child):
19 for feature in rec.features:
20 parent_feature_id = rec.id
21 if interpro:
22 if feature.type == 'polypeptide':
23 continue
24 if '_' in parent_feature_id:
25 parent_feature_id = parent_feature_id[parent_feature_id.index('_') + 1:]
26
27 try:
28 child_features[parent_feature_id].append(feature)
29 except KeyError:
30 child_features[parent_feature_id] = [feature]
31 return child_features
32
33
34 def __update_feature_location(feature, parent, protein2dna):
35 start = feature.location.start
36 end = feature.location.end
37 if protein2dna:
38 start *= 3
39 end *= 3
40
41 if parent.location.strand >= 0:
42 ns = parent.location.start + start
43 ne = parent.location.start + end
44 st = +1
45 else:
46 ns = parent.location.end - end
47 ne = parent.location.end - start
48 st = -1
49
50 # Don't let start/stops be less than zero. It's technically valid for them
51 # to be (at least in the model I'm working with) but it causes numerous
52 # issues.
53 #
54 # Instead, we'll replace with %3 to try and keep it in the same reading
55 # frame that it should be in.
56 if ns < 0:
57 ns %= 3
58 if ne < 0:
59 ne %= 3
60
61 feature.location = FeatureLocation(ns, ne, strand=st)
62
63 if hasattr(feature, 'sub_features'):
64 for subfeature in feature.sub_features:
65 __update_feature_location(subfeature, parent, protein2dna)
66
67
68 def rebase(parent, child, interpro=False, protein2dna=False):
69 child_features = __get_features(child, interpro=interpro)
70
71 for rec in GFF.parse(parent):
72 # TODO, replace with recursion in case it's matched against a
73 # non-parent feature. We're cheating a bit here right now...
74 replacement_features = []
75 for feature in rec.features:
76 if feature.id in child_features:
77 new_subfeatures = child_features[feature.id]
78 # TODO: update starts
79 fixed_subfeatures = []
80 for x in new_subfeatures:
81 # Then update the location of the actual feature
82 __update_feature_location(x, feature, protein2dna)
83
84 if interpro:
85 for y in ('status', 'Target'):
86 try:
87 del x.qualifiers[y]
88 except:
89 pass
90
91 fixed_subfeatures.append(x)
92 replacement_features.extend(fixed_subfeatures)
93 # We do this so we don't include the original set of features that we
94 # were rebasing against in our result.
95 rec.features = replacement_features
96 GFF.write([rec], sys.stdout)
97
98
99 if __name__ == '__main__':
100 parser = argparse.ArgumentParser(description='rebase gff3 features against parent locations', epilog="")
101 parser.add_argument('parent', type=file, help='Parent GFF3 annotations')
102 parser.add_argument('child', help='Child GFF3 annotations to rebase against parent')
103 parser.add_argument('--interpro', action='store_true',
104 help='Interpro specific modifications')
105 parser.add_argument('--protein2dna', action='store_true',
106 help='Map protein translated results to original DNA data')
107 args = parser.parse_args()
108 rebase(**vars(args))