diff gff3_rebase.py @ 1:497c6bb3b717 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/jbrowse commit 0887009a23d176b21536c9fd8a18c4fecc417d4f
author iuc
date Thu, 18 Jun 2015 12:10:51 -0400
parents
children 7342f467507b
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gff3_rebase.py	Thu Jun 18 12:10:51 2015 -0400
@@ -0,0 +1,108 @@
+#!/usr/bin/env python
+import sys
+import logging
+logging.basicConfig(level=logging.INFO)
+import argparse
+from BCBio import GFF
+from Bio.SeqFeature import FeatureLocation
+log = logging.getLogger(__name__)
+
+__author__ = "Eric Rasche"
+__version__ = "0.4.0"
+__maintainer__ = "Eric Rasche"
+__email__ = "esr@tamu.edu"
+
+
+def __get_features(child, interpro=False):
+    child_features = {}
+    for rec in GFF.parse(child):
+        for feature in rec.features:
+            parent_feature_id = rec.id
+            if interpro:
+                if feature.type == 'polypeptide':
+                    continue
+                if '_' in parent_feature_id:
+                    parent_feature_id = parent_feature_id[parent_feature_id.index('_') + 1:]
+
+            try:
+                child_features[parent_feature_id].append(feature)
+            except KeyError:
+                child_features[parent_feature_id] = [feature]
+    return child_features
+
+
+def __update_feature_location(feature, parent, protein2dna):
+    start = feature.location.start
+    end = feature.location.end
+    if protein2dna:
+        start *= 3
+        end *= 3
+
+    if parent.location.strand >= 0:
+        ns = parent.location.start + start
+        ne = parent.location.start + end
+        st = +1
+    else:
+        ns = parent.location.end - end
+        ne = parent.location.end - start
+        st = -1
+
+    # Don't let start/stops be less than zero. It's technically valid for them
+    # to be (at least in the model I'm working with) but it causes numerous
+    # issues.
+    #
+    # Instead, we'll replace with %3 to try and keep it in the same reading
+    # frame that it should be in.
+    if ns < 0:
+        ns %= 3
+    if ne < 0:
+        ne %= 3
+
+    feature.location = FeatureLocation(ns, ne, strand=st)
+
+    if hasattr(feature, 'sub_features'):
+        for subfeature in feature.sub_features:
+            __update_feature_location(subfeature, parent, protein2dna)
+
+
+def rebase(parent, child, interpro=False, protein2dna=False):
+    child_features = __get_features(child, interpro=interpro)
+
+    for rec in GFF.parse(parent):
+        # TODO, replace with recursion in case it's matched against a
+        # non-parent feature. We're cheating a bit here right now...
+        replacement_features = []
+        for feature in rec.features:
+            if feature.id in child_features:
+                new_subfeatures = child_features[feature.id]
+                # TODO: update starts
+                fixed_subfeatures = []
+                for x in new_subfeatures:
+                    # Then update the location of the actual feature
+                    __update_feature_location(x, feature, protein2dna)
+
+                    if interpro:
+                        for y in ('status', 'Target'):
+                            try:
+                                del x.qualifiers[y]
+                            except:
+                                pass
+
+                    fixed_subfeatures.append(x)
+                replacement_features.extend(fixed_subfeatures)
+        # We do this so we don't include the original set of features that we
+        # were rebasing against in our result.
+        rec.features = replacement_features
+        GFF.write([rec], sys.stdout)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='rebase gff3 features against parent locations', epilog="")
+    parser.add_argument('parent', type=file, help='Parent GFF3 annotations')
+    parser.add_argument('child', help='Child GFF3 annotations to rebase against parent')
+    parser.add_argument('--interpro', action='store_true',
+                        help='Interpro specific modifications')
+    parser.add_argument('--protein2dna', action='store_true',
+                        help='Map protein translated results to original DNA data')
+    args = parser.parse_args()
+    rebase(**vars(args))