comparison gff3_rebase.py @ 0:e54940ea270c draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/gff3_rebase commit 350ab347625ed5941873ba0deb3e1cf219d60052
author iuc
date Thu, 20 Apr 2017 08:12:49 -0400
parents
children ea35a85b941d
comparison
equal deleted inserted replaced
-1:000000000000 0:e54940ea270c
1 #!/usr/bin/env python
2 import argparse
3 import copy
4 import logging
5 import sys
6
7 from BCBio import GFF
8 from Bio.SeqFeature import FeatureLocation
9
10 logging.basicConfig(level=logging.INFO)
11 log = logging.getLogger(__name__)
12
13 __author__ = "Eric Rasche"
14 __version__ = "0.4.0"
15 __maintainer__ = "Eric Rasche"
16 __email__ = "esr@tamu.edu"
17
18
19 def feature_lambda(feature_list, test, test_kwargs, subfeatures=True):
20 """Recursively search through features, testing each with a test function, yielding matches.
21
22 GFF3 is a hierachical data structure, so we need to be able to recursively
23 search through features. E.g. if you're looking for a feature with
24 ID='bob.42', you can't just do a simple list comprehension with a test
25 case. You don't know how deeply burried bob.42 will be in the feature tree. This is where feature_lambda steps in.
26
27 :type feature_list: list
28 :param feature_list: an iterable of features
29
30 :type test: function reference
31 :param test: a closure with the method signature (feature, **kwargs) where
32 the kwargs are those passed in the next argument. This
33 function should return True or False, True if the feature is
34 to be yielded as part of the main feature_lambda function, or
35 False if it is to be ignored. This function CAN mutate the
36 features passed to it (think "apply").
37
38 :type test_kwargs: dictionary
39 :param test_kwargs: kwargs to pass to your closure when it is called.
40
41 :type subfeatures: boolean
42 :param subfeatures: when a feature is matched, should just that feature be
43 yielded to the caller, or should the entire sub_feature
44 tree for that feature be included? subfeatures=True is
45 useful in cases such as searching for a gene feature,
46 and wanting to know what RBS/Shine_Dalgarno_sequences
47 are in the sub_feature tree (which can be accomplished
48 with two feature_lambda calls). subfeatures=False is
49 useful in cases when you want to process (and possibly
50 return) the entire feature tree, such as applying a
51 qualifier to every single feature.
52
53 :rtype: yielded list
54 :return: Yields a list of matching features.
55 """
56 # Either the top level set of [features] or the subfeature attribute
57 for feature in feature_list:
58 if test(feature, **test_kwargs):
59 if not subfeatures:
60 feature_copy = copy.deepcopy(feature)
61 feature_copy.sub_features = []
62 yield feature_copy
63 else:
64 yield feature
65
66 if hasattr(feature, 'sub_features'):
67 for x in feature_lambda(feature.sub_features, test, test_kwargs, subfeatures=subfeatures):
68 yield x
69
70
71 def feature_test_qual_value(feature, **kwargs):
72 """Test qualifier values.
73
74 For every feature, check that at least one value in
75 feature.quailfiers(kwargs['qualifier']) is in kwargs['attribute_list']
76 """
77 for attribute_value in feature.qualifiers.get(kwargs['qualifier'], []):
78 if attribute_value in kwargs['attribute_list']:
79 return True
80 return False
81
82
83 def __get_features(child, interpro=False):
84 child_features = {}
85 for rec in GFF.parse(child):
86 for feature in rec.features:
87 parent_feature_id = rec.id
88 if interpro:
89 if feature.type == 'polypeptide':
90 continue
91 if '_' in parent_feature_id:
92 parent_feature_id = parent_feature_id[parent_feature_id.index('_') + 1:]
93
94 try:
95 child_features[parent_feature_id].append(feature)
96 except KeyError:
97 child_features[parent_feature_id] = [feature]
98 return child_features
99
100
101 def __update_feature_location(feature, parent, protein2dna):
102 start = feature.location.start
103 end = feature.location.end
104 if protein2dna:
105 start *= 3
106 end *= 3
107
108 if parent.location.strand >= 0:
109 ns = parent.location.start + start
110 ne = parent.location.start + end
111 st = +1
112 else:
113 ns = parent.location.end - end
114 ne = parent.location.end - start
115 st = -1
116
117 # Don't let start/stops be less than zero. It's technically valid for them
118 # to be (at least in the model I'm working with) but it causes numerous
119 # issues.
120 #
121 # Instead, we'll replace with %3 to try and keep it in the same reading
122 # frame that it should be in.
123 if ns < 0:
124 ns %= 3
125 if ne < 0:
126 ne %= 3
127
128 feature.location = FeatureLocation(ns, ne, strand=st)
129
130 if hasattr(feature, 'sub_features'):
131 for subfeature in feature.sub_features:
132 __update_feature_location(subfeature, parent, protein2dna)
133
134
135 def rebase(parent, child, interpro=False, protein2dna=False):
136 child_features = __get_features(child, interpro=interpro)
137
138 for rec in GFF.parse(parent):
139 replacement_features = []
140 for feature in feature_lambda(
141 rec.features,
142 feature_test_qual_value,
143 {
144 'qualifier': 'ID',
145 'attribute_list': child_features.keys(),
146 },
147 subfeatures=False):
148
149 new_subfeatures = child_features[feature.id]
150 fixed_subfeatures = []
151 for x in new_subfeatures:
152 # Then update the location of the actual feature
153 __update_feature_location(x, feature, protein2dna)
154
155 if interpro:
156 for y in ('status', 'Target'):
157 try:
158 del x.qualifiers[y]
159 except:
160 pass
161
162 fixed_subfeatures.append(x)
163 replacement_features.extend(fixed_subfeatures)
164 # We do this so we don't include the original set of features that we
165 # were rebasing against in our result.
166 rec.features = replacement_features
167 rec.annotations = {}
168 GFF.write([rec], sys.stdout)
169
170
171 if __name__ == '__main__':
172 parser = argparse.ArgumentParser(description='rebase gff3 features against parent locations', epilog="")
173 parser.add_argument('parent', type=argparse.FileType('r'), help='Parent GFF3 annotations')
174 parser.add_argument('child', type=argparse.FileType('r'), help='Child GFF3 annotations to rebase against parent')
175 parser.add_argument('--interpro', action='store_true',
176 help='Interpro specific modifications')
177 parser.add_argument('--protein2dna', action='store_true',
178 help='Map protein translated results to original DNA data')
179 args = parser.parse_args()
180 rebase(**vars(args))