Mercurial > repos > iuc > jbrowse
diff gff3_rebase.py @ 3:7342f467507b draft
Uploaded v0.4 of JBrowse
author | iuc |
---|---|
date | Thu, 31 Dec 2015 13:58:43 -0500 |
parents | 497c6bb3b717 |
children | ad4b9d7eae6a |
line wrap: on
line diff
--- a/gff3_rebase.py Tue Jun 23 12:10:15 2015 -0400 +++ b/gff3_rebase.py Thu Dec 31 13:58:43 2015 -0500 @@ -3,6 +3,7 @@ import logging logging.basicConfig(level=logging.INFO) import argparse +import copy from BCBio import GFF from Bio.SeqFeature import FeatureLocation log = logging.getLogger(__name__) @@ -13,6 +14,70 @@ __email__ = "esr@tamu.edu" +def feature_lambda(feature_list, test, test_kwargs, subfeatures=True): + """Recursively search through features, testing each with a test function, yielding matches. + + GFF3 is a hierachical data structure, so we need to be able to recursively + search through features. E.g. if you're looking for a feature with + ID='bob.42', you can't just do a simple list comprehension with a test + case. You don't know how deeply burried bob.42 will be in the feature tree. This is where feature_lambda steps in. + + :type feature_list: list + :param feature_list: an iterable of features + + :type test: function reference + :param test: a closure with the method signature (feature, **kwargs) where + the kwargs are those passed in the next argument. This + function should return True or False, True if the feature is + to be yielded as part of the main feature_lambda function, or + False if it is to be ignored. This function CAN mutate the + features passed to it (think "apply"). + + :type test_kwargs: dictionary + :param test_kwargs: kwargs to pass to your closure when it is called. + + :type subfeatures: boolean + :param subfeatures: when a feature is matched, should just that feature be + yielded to the caller, or should the entire sub_feature + tree for that feature be included? subfeatures=True is + useful in cases such as searching for a gene feature, + and wanting to know what RBS/Shine_Dalgarno_sequences + are in the sub_feature tree (which can be accomplished + with two feature_lambda calls). subfeatures=False is + useful in cases when you want to process (and possibly + return) the entire feature tree, such as applying a + qualifier to every single feature. + + :rtype: yielded list + :return: Yields a list of matching features. + """ + # Either the top level set of [features] or the subfeature attribute + for feature in feature_list: + if test(feature, **test_kwargs): + if not subfeatures: + feature_copy = copy.deepcopy(feature) + feature_copy.sub_features = [] + yield feature_copy + else: + yield feature + + if hasattr(feature, 'sub_features'): + for x in feature_lambda(feature.sub_features, test, test_kwargs, subfeatures=subfeatures): + yield x + + +def feature_test_qual_value(feature, **kwargs): + """Test qualifier values. + + For every feature, check that at least one value in + feature.quailfiers(kwargs['qualifier']) is in kwargs['attribute_list'] + """ + for attribute_value in feature.qualifiers.get(kwargs['qualifier'], []): + if attribute_value in kwargs['attribute_list']: + return True + return False + + def __get_features(child, interpro=False): child_features = {} for rec in GFF.parse(child): @@ -69,30 +134,35 @@ child_features = __get_features(child, interpro=interpro) for rec in GFF.parse(parent): - # TODO, replace with recursion in case it's matched against a - # non-parent feature. We're cheating a bit here right now... replacement_features = [] - for feature in rec.features: - if feature.id in child_features: - new_subfeatures = child_features[feature.id] - # TODO: update starts - fixed_subfeatures = [] - for x in new_subfeatures: - # Then update the location of the actual feature - __update_feature_location(x, feature, protein2dna) + for feature in feature_lambda( + rec.features, + feature_test_qual_value, + { + 'qualifier': 'ID', + 'attribute_list': child_features.keys(), + }, + subfeatures=False): - if interpro: - for y in ('status', 'Target'): - try: - del x.qualifiers[y] - except: - pass + new_subfeatures = child_features[feature.id] + fixed_subfeatures = [] + for x in new_subfeatures: + # Then update the location of the actual feature + __update_feature_location(x, feature, protein2dna) - fixed_subfeatures.append(x) - replacement_features.extend(fixed_subfeatures) + if interpro: + for y in ('status', 'Target'): + try: + del x.qualifiers[y] + except: + pass + + fixed_subfeatures.append(x) + replacement_features.extend(fixed_subfeatures) # We do this so we don't include the original set of features that we # were rebasing against in our result. rec.features = replacement_features + rec.annotations = {} GFF.write([rec], sys.stdout)