Mercurial > repos > iuc > gff3_rebase
changeset 0:e54940ea270c draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/gff3_rebase commit 350ab347625ed5941873ba0deb3e1cf219d60052
author | iuc |
---|---|
date | Thu, 20 Apr 2017 08:12:49 -0400 |
parents | |
children | ea35a85b941d |
files | gff3_rebase.py gff3_rebase.xml macros.xml test-data/child.gff test-data/parent.gff test-data/proteins.gff |
diffstat | 6 files changed, 289 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gff3_rebase.py Thu Apr 20 08:12:49 2017 -0400 @@ -0,0 +1,180 @@ +#!/usr/bin/env python +import argparse +import copy +import logging +import sys + +from BCBio import GFF +from Bio.SeqFeature import FeatureLocation + +logging.basicConfig(level=logging.INFO) +log = logging.getLogger(__name__) + +__author__ = "Eric Rasche" +__version__ = "0.4.0" +__maintainer__ = "Eric Rasche" +__email__ = "esr@tamu.edu" + + +def feature_lambda(feature_list, test, test_kwargs, subfeatures=True): + """Recursively search through features, testing each with a test function, yielding matches. + + GFF3 is a hierachical data structure, so we need to be able to recursively + search through features. E.g. if you're looking for a feature with + ID='bob.42', you can't just do a simple list comprehension with a test + case. You don't know how deeply burried bob.42 will be in the feature tree. This is where feature_lambda steps in. + + :type feature_list: list + :param feature_list: an iterable of features + + :type test: function reference + :param test: a closure with the method signature (feature, **kwargs) where + the kwargs are those passed in the next argument. This + function should return True or False, True if the feature is + to be yielded as part of the main feature_lambda function, or + False if it is to be ignored. This function CAN mutate the + features passed to it (think "apply"). + + :type test_kwargs: dictionary + :param test_kwargs: kwargs to pass to your closure when it is called. + + :type subfeatures: boolean + :param subfeatures: when a feature is matched, should just that feature be + yielded to the caller, or should the entire sub_feature + tree for that feature be included? subfeatures=True is + useful in cases such as searching for a gene feature, + and wanting to know what RBS/Shine_Dalgarno_sequences + are in the sub_feature tree (which can be accomplished + with two feature_lambda calls). subfeatures=False is + useful in cases when you want to process (and possibly + return) the entire feature tree, such as applying a + qualifier to every single feature. + + :rtype: yielded list + :return: Yields a list of matching features. + """ + # Either the top level set of [features] or the subfeature attribute + for feature in feature_list: + if test(feature, **test_kwargs): + if not subfeatures: + feature_copy = copy.deepcopy(feature) + feature_copy.sub_features = [] + yield feature_copy + else: + yield feature + + if hasattr(feature, 'sub_features'): + for x in feature_lambda(feature.sub_features, test, test_kwargs, subfeatures=subfeatures): + yield x + + +def feature_test_qual_value(feature, **kwargs): + """Test qualifier values. + + For every feature, check that at least one value in + feature.quailfiers(kwargs['qualifier']) is in kwargs['attribute_list'] + """ + for attribute_value in feature.qualifiers.get(kwargs['qualifier'], []): + if attribute_value in kwargs['attribute_list']: + return True + return False + + +def __get_features(child, interpro=False): + child_features = {} + for rec in GFF.parse(child): + for feature in rec.features: + parent_feature_id = rec.id + if interpro: + if feature.type == 'polypeptide': + continue + if '_' in parent_feature_id: + parent_feature_id = parent_feature_id[parent_feature_id.index('_') + 1:] + + try: + child_features[parent_feature_id].append(feature) + except KeyError: + child_features[parent_feature_id] = [feature] + return child_features + + +def __update_feature_location(feature, parent, protein2dna): + start = feature.location.start + end = feature.location.end + if protein2dna: + start *= 3 + end *= 3 + + if parent.location.strand >= 0: + ns = parent.location.start + start + ne = parent.location.start + end + st = +1 + else: + ns = parent.location.end - end + ne = parent.location.end - start + st = -1 + + # Don't let start/stops be less than zero. It's technically valid for them + # to be (at least in the model I'm working with) but it causes numerous + # issues. + # + # Instead, we'll replace with %3 to try and keep it in the same reading + # frame that it should be in. + if ns < 0: + ns %= 3 + if ne < 0: + ne %= 3 + + feature.location = FeatureLocation(ns, ne, strand=st) + + if hasattr(feature, 'sub_features'): + for subfeature in feature.sub_features: + __update_feature_location(subfeature, parent, protein2dna) + + +def rebase(parent, child, interpro=False, protein2dna=False): + child_features = __get_features(child, interpro=interpro) + + for rec in GFF.parse(parent): + replacement_features = [] + for feature in feature_lambda( + rec.features, + feature_test_qual_value, + { + 'qualifier': 'ID', + 'attribute_list': child_features.keys(), + }, + subfeatures=False): + + new_subfeatures = child_features[feature.id] + fixed_subfeatures = [] + for x in new_subfeatures: + # Then update the location of the actual feature + __update_feature_location(x, feature, protein2dna) + + if interpro: + for y in ('status', 'Target'): + try: + del x.qualifiers[y] + except: + pass + + fixed_subfeatures.append(x) + replacement_features.extend(fixed_subfeatures) + # We do this so we don't include the original set of features that we + # were rebasing against in our result. + rec.features = replacement_features + rec.annotations = {} + GFF.write([rec], sys.stdout) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='rebase gff3 features against parent locations', epilog="") + parser.add_argument('parent', type=argparse.FileType('r'), help='Parent GFF3 annotations') + parser.add_argument('child', type=argparse.FileType('r'), help='Child GFF3 annotations to rebase against parent') + parser.add_argument('--interpro', action='store_true', + help='Interpro specific modifications') + parser.add_argument('--protein2dna', action='store_true', + help='Map protein translated results to original DNA data') + args = parser.parse_args() + rebase(**vars(args))
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gff3_rebase.xml Thu Apr 20 08:12:49 2017 -0400 @@ -0,0 +1,93 @@ +<tool id="gff3.rebase" name="Rebase GFF3 features" version="1.2"> + <description>against parent features</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements"/> + <version_command>python gff3_rebase.py --version</version_command> + <command detect_errors="aggressive"><![CDATA[ + python '$__tool_directory__/gff3_rebase.py' + '$parent' + '$child' + + $interpro + $protein2dna + > '$output']]> + </command> + <inputs> + <param label="Parent GFF3 annotations" name="parent" format="gff3" type="data"/> + <param label="Child GFF3 annotations to rebase against parent" name="child" format="gff3" type="data"/> + + <param label="Interpro specific modifications" name="interpro" type="boolean" truevalue="--interpro" falsevalue=""/> + <param label="Map protein translated results to original DNA data" name="protein2dna" type="boolean" truevalue="--protein2dna" falsevalue=""/> + </inputs> + <outputs> + <data format="gff3" name="output"/> + </outputs> + <tests> + <test> + <param name="parent" value="parent.gff"/> + <param name="child" value="child.gff"/> + <param name="protein2dna" value="True" /> + <output name="output" file="proteins.gff"/> + </test> + </tests> + <help><![CDATA[ +**What it does** + +Often the genomic data processing/analysis process requires a workflow like the following: + +- select some features from a genome +- export the sequences associated with those regions +- analyse those exports with some tool like Blast + +For display, especially in software like JBrowse, it is convenient to know +where in the original genome the analysis results would fall. E.g. if a +transmembrane domain is detected at bases 10-20 of an analysed protein, where +should this be displayed relative to the parent genome? + +This tool helps fill that gap, by rebasing some analysis results against the +parent features which were originally analysed. + +**Example Inputs** + +For a "child" set of annotations like:: + + #gff-version 3 + cds42 blastp match_part 1 50 1e-40 . . ID=m00001;Notes=RNAse A Protein + +And a parent set of annotations like:: + + #gff-version 3 + PhageBob maker cds 300 600 . + . ID=cds42 + +One could imagine that during the analysis process, the user had exported the parent annotation into some sequence:: + + >cds42 + M...... + +and then analysed those results, producing the "child" annotation file. This +tool will then localize the results properly against the parent:: + + #gff-version 3 + PhageBob blastp match_part 300 449 1e-40 + . ID=m00001;Notes=RNAse A Protein + +which will allow you to display the results in the correct location in visualizations. + +**Options** + +There are two optional flags which can be passed. + +The Interpro flag selectively ignores features which shouldn't be included in +the output (i.e. ``polypeptide``), and a couple of qualifiers that aren't +useful (``status``, ``Target``) + +The "Map Protein..." flag says that you translated the sequences during the +genomic export process, analysing protein sequences. This indicates to the +software that the bases should be multiplied by three to obtain the correct DNA +locations. +]]></help> + <citations> + <citation type="doi">10.1093/bioinformatics/btp163</citation> + </citations> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Thu Apr 20 08:12:49 2017 -0400 @@ -0,0 +1,9 @@ +<?xml version="1.0"?> +<macros> + <xml name="requirements"> + <requirements> + <requirement type="package" version="0.6.4">bcbiogff</requirement> + <yield/> + </requirements> + </xml> +</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/child.gff Thu Apr 20 08:12:49 2017 -0400 @@ -0,0 +1,2 @@ +#gff-version 3 +cds42 blastp match_part 1 50 1e-40 . . ID=m00001;Notes=RNAse A Protein