diff gff3_rebase.py @ 3:7342f467507b draft

Uploaded v0.4 of JBrowse
author iuc
date Thu, 31 Dec 2015 13:58:43 -0500
parents 497c6bb3b717
children ad4b9d7eae6a
line wrap: on
line diff
--- a/gff3_rebase.py	Tue Jun 23 12:10:15 2015 -0400
+++ b/gff3_rebase.py	Thu Dec 31 13:58:43 2015 -0500
@@ -3,6 +3,7 @@
 import logging
 logging.basicConfig(level=logging.INFO)
 import argparse
+import copy
 from BCBio import GFF
 from Bio.SeqFeature import FeatureLocation
 log = logging.getLogger(__name__)
@@ -13,6 +14,70 @@
 __email__ = "esr@tamu.edu"
 
 
+def feature_lambda(feature_list, test, test_kwargs, subfeatures=True):
+    """Recursively search through features, testing each with a test function, yielding matches.
+
+    GFF3 is a hierachical data structure, so we need to be able to recursively
+    search through features. E.g. if you're looking for a feature with
+    ID='bob.42', you can't just do a simple list comprehension with a test
+    case. You don't know how deeply burried bob.42 will be in the feature tree. This is where feature_lambda steps in.
+
+    :type feature_list: list
+    :param feature_list: an iterable of features
+
+    :type test: function reference
+    :param test: a closure with the method signature (feature, **kwargs) where
+                 the kwargs are those passed in the next argument. This
+                 function should return True or False, True if the feature is
+                 to be yielded as part of the main feature_lambda function, or
+                 False if it is to be ignored. This function CAN mutate the
+                 features passed to it (think "apply").
+
+    :type test_kwargs: dictionary
+    :param test_kwargs: kwargs to pass to your closure when it is called.
+
+    :type subfeatures: boolean
+    :param subfeatures: when a feature is matched, should just that feature be
+                        yielded to the caller, or should the entire sub_feature
+                        tree for that feature be included? subfeatures=True is
+                        useful in cases such as searching for a gene feature,
+                        and wanting to know what RBS/Shine_Dalgarno_sequences
+                        are in the sub_feature tree (which can be accomplished
+                        with two feature_lambda calls). subfeatures=False is
+                        useful in cases when you want to process (and possibly
+                        return) the entire feature tree, such as applying a
+                        qualifier to every single feature.
+
+    :rtype: yielded list
+    :return: Yields a list of matching features.
+    """
+    # Either the top level set of [features] or the subfeature attribute
+    for feature in feature_list:
+        if test(feature, **test_kwargs):
+            if not subfeatures:
+                feature_copy = copy.deepcopy(feature)
+                feature_copy.sub_features = []
+                yield feature_copy
+            else:
+                yield feature
+
+        if hasattr(feature, 'sub_features'):
+            for x in feature_lambda(feature.sub_features, test, test_kwargs, subfeatures=subfeatures):
+                yield x
+
+
+def feature_test_qual_value(feature, **kwargs):
+    """Test qualifier values.
+
+    For every feature, check that at least one value in
+    feature.quailfiers(kwargs['qualifier']) is in kwargs['attribute_list']
+    """
+    for attribute_value in feature.qualifiers.get(kwargs['qualifier'], []):
+        if attribute_value in kwargs['attribute_list']:
+            return True
+    return False
+
+
 def __get_features(child, interpro=False):
     child_features = {}
     for rec in GFF.parse(child):
@@ -69,30 +134,35 @@
     child_features = __get_features(child, interpro=interpro)
 
     for rec in GFF.parse(parent):
-        # TODO, replace with recursion in case it's matched against a
-        # non-parent feature. We're cheating a bit here right now...
         replacement_features = []
-        for feature in rec.features:
-            if feature.id in child_features:
-                new_subfeatures = child_features[feature.id]
-                # TODO: update starts
-                fixed_subfeatures = []
-                for x in new_subfeatures:
-                    # Then update the location of the actual feature
-                    __update_feature_location(x, feature, protein2dna)
+        for feature in feature_lambda(
+                rec.features,
+                feature_test_qual_value,
+                {
+                    'qualifier': 'ID',
+                    'attribute_list': child_features.keys(),
+                },
+                subfeatures=False):
 
-                    if interpro:
-                        for y in ('status', 'Target'):
-                            try:
-                                del x.qualifiers[y]
-                            except:
-                                pass
+            new_subfeatures = child_features[feature.id]
+            fixed_subfeatures = []
+            for x in new_subfeatures:
+                # Then update the location of the actual feature
+                __update_feature_location(x, feature, protein2dna)
 
-                    fixed_subfeatures.append(x)
-                replacement_features.extend(fixed_subfeatures)
+                if interpro:
+                    for y in ('status', 'Target'):
+                        try:
+                            del x.qualifiers[y]
+                        except:
+                            pass
+
+                fixed_subfeatures.append(x)
+            replacement_features.extend(fixed_subfeatures)
         # We do this so we don't include the original set of features that we
         # were rebasing against in our result.
         rec.features = replacement_features
+        rec.annotations = {}
         GFF.write([rec], sys.stdout)