comparison gff3_rebase.py @ 3:7342f467507b draft

Uploaded v0.4 of JBrowse
author iuc
date Thu, 31 Dec 2015 13:58:43 -0500
parents 497c6bb3b717
children ad4b9d7eae6a
comparison
equal deleted inserted replaced
2:b6a0e126dbee 3:7342f467507b
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 import sys 2 import sys
3 import logging 3 import logging
4 logging.basicConfig(level=logging.INFO) 4 logging.basicConfig(level=logging.INFO)
5 import argparse 5 import argparse
6 import copy
6 from BCBio import GFF 7 from BCBio import GFF
7 from Bio.SeqFeature import FeatureLocation 8 from Bio.SeqFeature import FeatureLocation
8 log = logging.getLogger(__name__) 9 log = logging.getLogger(__name__)
9 10
10 __author__ = "Eric Rasche" 11 __author__ = "Eric Rasche"
11 __version__ = "0.4.0" 12 __version__ = "0.4.0"
12 __maintainer__ = "Eric Rasche" 13 __maintainer__ = "Eric Rasche"
13 __email__ = "esr@tamu.edu" 14 __email__ = "esr@tamu.edu"
15
16
17 def feature_lambda(feature_list, test, test_kwargs, subfeatures=True):
18 """Recursively search through features, testing each with a test function, yielding matches.
19
20 GFF3 is a hierachical data structure, so we need to be able to recursively
21 search through features. E.g. if you're looking for a feature with
22 ID='bob.42', you can't just do a simple list comprehension with a test
23 case. You don't know how deeply burried bob.42 will be in the feature tree. This is where feature_lambda steps in.
24
25 :type feature_list: list
26 :param feature_list: an iterable of features
27
28 :type test: function reference
29 :param test: a closure with the method signature (feature, **kwargs) where
30 the kwargs are those passed in the next argument. This
31 function should return True or False, True if the feature is
32 to be yielded as part of the main feature_lambda function, or
33 False if it is to be ignored. This function CAN mutate the
34 features passed to it (think "apply").
35
36 :type test_kwargs: dictionary
37 :param test_kwargs: kwargs to pass to your closure when it is called.
38
39 :type subfeatures: boolean
40 :param subfeatures: when a feature is matched, should just that feature be
41 yielded to the caller, or should the entire sub_feature
42 tree for that feature be included? subfeatures=True is
43 useful in cases such as searching for a gene feature,
44 and wanting to know what RBS/Shine_Dalgarno_sequences
45 are in the sub_feature tree (which can be accomplished
46 with two feature_lambda calls). subfeatures=False is
47 useful in cases when you want to process (and possibly
48 return) the entire feature tree, such as applying a
49 qualifier to every single feature.
50
51 :rtype: yielded list
52 :return: Yields a list of matching features.
53 """
54 # Either the top level set of [features] or the subfeature attribute
55 for feature in feature_list:
56 if test(feature, **test_kwargs):
57 if not subfeatures:
58 feature_copy = copy.deepcopy(feature)
59 feature_copy.sub_features = []
60 yield feature_copy
61 else:
62 yield feature
63
64 if hasattr(feature, 'sub_features'):
65 for x in feature_lambda(feature.sub_features, test, test_kwargs, subfeatures=subfeatures):
66 yield x
67
68
69 def feature_test_qual_value(feature, **kwargs):
70 """Test qualifier values.
71
72 For every feature, check that at least one value in
73 feature.quailfiers(kwargs['qualifier']) is in kwargs['attribute_list']
74 """
75 for attribute_value in feature.qualifiers.get(kwargs['qualifier'], []):
76 if attribute_value in kwargs['attribute_list']:
77 return True
78 return False
14 79
15 80
16 def __get_features(child, interpro=False): 81 def __get_features(child, interpro=False):
17 child_features = {} 82 child_features = {}
18 for rec in GFF.parse(child): 83 for rec in GFF.parse(child):
67 132
68 def rebase(parent, child, interpro=False, protein2dna=False): 133 def rebase(parent, child, interpro=False, protein2dna=False):
69 child_features = __get_features(child, interpro=interpro) 134 child_features = __get_features(child, interpro=interpro)
70 135
71 for rec in GFF.parse(parent): 136 for rec in GFF.parse(parent):
72 # TODO, replace with recursion in case it's matched against a
73 # non-parent feature. We're cheating a bit here right now...
74 replacement_features = [] 137 replacement_features = []
75 for feature in rec.features: 138 for feature in feature_lambda(
76 if feature.id in child_features: 139 rec.features,
77 new_subfeatures = child_features[feature.id] 140 feature_test_qual_value,
78 # TODO: update starts 141 {
79 fixed_subfeatures = [] 142 'qualifier': 'ID',
80 for x in new_subfeatures: 143 'attribute_list': child_features.keys(),
81 # Then update the location of the actual feature 144 },
82 __update_feature_location(x, feature, protein2dna) 145 subfeatures=False):
83 146
84 if interpro: 147 new_subfeatures = child_features[feature.id]
85 for y in ('status', 'Target'): 148 fixed_subfeatures = []
86 try: 149 for x in new_subfeatures:
87 del x.qualifiers[y] 150 # Then update the location of the actual feature
88 except: 151 __update_feature_location(x, feature, protein2dna)
89 pass
90 152
91 fixed_subfeatures.append(x) 153 if interpro:
92 replacement_features.extend(fixed_subfeatures) 154 for y in ('status', 'Target'):
155 try:
156 del x.qualifiers[y]
157 except:
158 pass
159
160 fixed_subfeatures.append(x)
161 replacement_features.extend(fixed_subfeatures)
93 # We do this so we don't include the original set of features that we 162 # We do this so we don't include the original set of features that we
94 # were rebasing against in our result. 163 # were rebasing against in our result.
95 rec.features = replacement_features 164 rec.features = replacement_features
165 rec.annotations = {}
96 GFF.write([rec], sys.stdout) 166 GFF.write([rec], sys.stdout)
97 167
98 168
99 if __name__ == '__main__': 169 if __name__ == '__main__':
100 parser = argparse.ArgumentParser(description='rebase gff3 features against parent locations', epilog="") 170 parser = argparse.ArgumentParser(description='rebase gff3 features against parent locations', epilog="")