# HG changeset patch
# User iuc
# Date 1492690369 14400
# Node ID e54940ea270c87e83a9749d1124041dbd0f62f97
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/gff3_rebase commit 350ab347625ed5941873ba0deb3e1cf219d60052
diff -r 000000000000 -r e54940ea270c gff3_rebase.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/gff3_rebase.py Thu Apr 20 08:12:49 2017 -0400
@@ -0,0 +1,180 @@
+#!/usr/bin/env python
+import argparse
+import copy
+import logging
+import sys
+
+from BCBio import GFF
+from Bio.SeqFeature import FeatureLocation
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger(__name__)
+
+__author__ = "Eric Rasche"
+__version__ = "0.4.0"
+__maintainer__ = "Eric Rasche"
+__email__ = "esr@tamu.edu"
+
+
+def feature_lambda(feature_list, test, test_kwargs, subfeatures=True):
+ """Recursively search through features, testing each with a test function, yielding matches.
+
+ GFF3 is a hierachical data structure, so we need to be able to recursively
+ search through features. E.g. if you're looking for a feature with
+ ID='bob.42', you can't just do a simple list comprehension with a test
+ case. You don't know how deeply burried bob.42 will be in the feature tree. This is where feature_lambda steps in.
+
+ :type feature_list: list
+ :param feature_list: an iterable of features
+
+ :type test: function reference
+ :param test: a closure with the method signature (feature, **kwargs) where
+ the kwargs are those passed in the next argument. This
+ function should return True or False, True if the feature is
+ to be yielded as part of the main feature_lambda function, or
+ False if it is to be ignored. This function CAN mutate the
+ features passed to it (think "apply").
+
+ :type test_kwargs: dictionary
+ :param test_kwargs: kwargs to pass to your closure when it is called.
+
+ :type subfeatures: boolean
+ :param subfeatures: when a feature is matched, should just that feature be
+ yielded to the caller, or should the entire sub_feature
+ tree for that feature be included? subfeatures=True is
+ useful in cases such as searching for a gene feature,
+ and wanting to know what RBS/Shine_Dalgarno_sequences
+ are in the sub_feature tree (which can be accomplished
+ with two feature_lambda calls). subfeatures=False is
+ useful in cases when you want to process (and possibly
+ return) the entire feature tree, such as applying a
+ qualifier to every single feature.
+
+ :rtype: yielded list
+ :return: Yields a list of matching features.
+ """
+ # Either the top level set of [features] or the subfeature attribute
+ for feature in feature_list:
+ if test(feature, **test_kwargs):
+ if not subfeatures:
+ feature_copy = copy.deepcopy(feature)
+ feature_copy.sub_features = []
+ yield feature_copy
+ else:
+ yield feature
+
+ if hasattr(feature, 'sub_features'):
+ for x in feature_lambda(feature.sub_features, test, test_kwargs, subfeatures=subfeatures):
+ yield x
+
+
+def feature_test_qual_value(feature, **kwargs):
+ """Test qualifier values.
+
+ For every feature, check that at least one value in
+ feature.quailfiers(kwargs['qualifier']) is in kwargs['attribute_list']
+ """
+ for attribute_value in feature.qualifiers.get(kwargs['qualifier'], []):
+ if attribute_value in kwargs['attribute_list']:
+ return True
+ return False
+
+
+def __get_features(child, interpro=False):
+ child_features = {}
+ for rec in GFF.parse(child):
+ for feature in rec.features:
+ parent_feature_id = rec.id
+ if interpro:
+ if feature.type == 'polypeptide':
+ continue
+ if '_' in parent_feature_id:
+ parent_feature_id = parent_feature_id[parent_feature_id.index('_') + 1:]
+
+ try:
+ child_features[parent_feature_id].append(feature)
+ except KeyError:
+ child_features[parent_feature_id] = [feature]
+ return child_features
+
+
+def __update_feature_location(feature, parent, protein2dna):
+ start = feature.location.start
+ end = feature.location.end
+ if protein2dna:
+ start *= 3
+ end *= 3
+
+ if parent.location.strand >= 0:
+ ns = parent.location.start + start
+ ne = parent.location.start + end
+ st = +1
+ else:
+ ns = parent.location.end - end
+ ne = parent.location.end - start
+ st = -1
+
+ # Don't let start/stops be less than zero. It's technically valid for them
+ # to be (at least in the model I'm working with) but it causes numerous
+ # issues.
+ #
+ # Instead, we'll replace with %3 to try and keep it in the same reading
+ # frame that it should be in.
+ if ns < 0:
+ ns %= 3
+ if ne < 0:
+ ne %= 3
+
+ feature.location = FeatureLocation(ns, ne, strand=st)
+
+ if hasattr(feature, 'sub_features'):
+ for subfeature in feature.sub_features:
+ __update_feature_location(subfeature, parent, protein2dna)
+
+
+def rebase(parent, child, interpro=False, protein2dna=False):
+ child_features = __get_features(child, interpro=interpro)
+
+ for rec in GFF.parse(parent):
+ replacement_features = []
+ for feature in feature_lambda(
+ rec.features,
+ feature_test_qual_value,
+ {
+ 'qualifier': 'ID',
+ 'attribute_list': child_features.keys(),
+ },
+ subfeatures=False):
+
+ new_subfeatures = child_features[feature.id]
+ fixed_subfeatures = []
+ for x in new_subfeatures:
+ # Then update the location of the actual feature
+ __update_feature_location(x, feature, protein2dna)
+
+ if interpro:
+ for y in ('status', 'Target'):
+ try:
+ del x.qualifiers[y]
+ except:
+ pass
+
+ fixed_subfeatures.append(x)
+ replacement_features.extend(fixed_subfeatures)
+ # We do this so we don't include the original set of features that we
+ # were rebasing against in our result.
+ rec.features = replacement_features
+ rec.annotations = {}
+ GFF.write([rec], sys.stdout)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(description='rebase gff3 features against parent locations', epilog="")
+ parser.add_argument('parent', type=argparse.FileType('r'), help='Parent GFF3 annotations')
+ parser.add_argument('child', type=argparse.FileType('r'), help='Child GFF3 annotations to rebase against parent')
+ parser.add_argument('--interpro', action='store_true',
+ help='Interpro specific modifications')
+ parser.add_argument('--protein2dna', action='store_true',
+ help='Map protein translated results to original DNA data')
+ args = parser.parse_args()
+ rebase(**vars(args))
diff -r 000000000000 -r e54940ea270c gff3_rebase.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/gff3_rebase.xml Thu Apr 20 08:12:49 2017 -0400
@@ -0,0 +1,93 @@
+
+ against parent features
+
+ macros.xml
+
+
+ python gff3_rebase.py --version
+ '$output']]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ cds42
+ M......
+
+and then analysed those results, producing the "child" annotation file. This
+tool will then localize the results properly against the parent::
+
+ #gff-version 3
+ PhageBob blastp match_part 300 449 1e-40 + . ID=m00001;Notes=RNAse A Protein
+
+which will allow you to display the results in the correct location in visualizations.
+
+**Options**
+
+There are two optional flags which can be passed.
+
+The Interpro flag selectively ignores features which shouldn't be included in
+the output (i.e. ``polypeptide``), and a couple of qualifiers that aren't
+useful (``status``, ``Target``)
+
+The "Map Protein..." flag says that you translated the sequences during the
+genomic export process, analysing protein sequences. This indicates to the
+software that the bases should be multiplied by three to obtain the correct DNA
+locations.
+]]>
+
+ 10.1093/bioinformatics/btp163
+
+
diff -r 000000000000 -r e54940ea270c macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Thu Apr 20 08:12:49 2017 -0400
@@ -0,0 +1,9 @@
+
+
+
+
+ bcbiogff
+
+
+
+
diff -r 000000000000 -r e54940ea270c test-data/child.gff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/child.gff Thu Apr 20 08:12:49 2017 -0400
@@ -0,0 +1,2 @@
+#gff-version 3
+cds42 blastp match_part 1 50 1e-40 . . ID=m00001;Notes=RNAse A Protein
diff -r 000000000000 -r e54940ea270c test-data/parent.gff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/parent.gff Thu Apr 20 08:12:49 2017 -0400
@@ -0,0 +1,2 @@
+#gff-version 3
+PhageBob maker cds 300 500 . + . ID=cds42
diff -r 000000000000 -r e54940ea270c test-data/proteins.gff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/proteins.gff Thu Apr 20 08:12:49 2017 -0400
@@ -0,0 +1,3 @@
+##gff-version 3
+##sequence-region PhageBob 1 500
+PhageBob blastp match_part 300 449 1e-40 + . ID=m00001;Notes=RNAse A Protein