Mercurial > repos > iuc > jbrowse
view gff3_rebase.py @ 2:b6a0e126dbee draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/jbrowse commit a9cedb0983cc262cfdac04464566fdded884318b
author | iuc |
---|---|
date | Tue, 23 Jun 2015 12:10:15 -0400 |
parents | 497c6bb3b717 |
children | 7342f467507b |
line wrap: on
line source
#!/usr/bin/env python import sys import logging logging.basicConfig(level=logging.INFO) import argparse from BCBio import GFF from Bio.SeqFeature import FeatureLocation log = logging.getLogger(__name__) __author__ = "Eric Rasche" __version__ = "0.4.0" __maintainer__ = "Eric Rasche" __email__ = "esr@tamu.edu" def __get_features(child, interpro=False): child_features = {} for rec in GFF.parse(child): for feature in rec.features: parent_feature_id = rec.id if interpro: if feature.type == 'polypeptide': continue if '_' in parent_feature_id: parent_feature_id = parent_feature_id[parent_feature_id.index('_') + 1:] try: child_features[parent_feature_id].append(feature) except KeyError: child_features[parent_feature_id] = [feature] return child_features def __update_feature_location(feature, parent, protein2dna): start = feature.location.start end = feature.location.end if protein2dna: start *= 3 end *= 3 if parent.location.strand >= 0: ns = parent.location.start + start ne = parent.location.start + end st = +1 else: ns = parent.location.end - end ne = parent.location.end - start st = -1 # Don't let start/stops be less than zero. It's technically valid for them # to be (at least in the model I'm working with) but it causes numerous # issues. # # Instead, we'll replace with %3 to try and keep it in the same reading # frame that it should be in. if ns < 0: ns %= 3 if ne < 0: ne %= 3 feature.location = FeatureLocation(ns, ne, strand=st) if hasattr(feature, 'sub_features'): for subfeature in feature.sub_features: __update_feature_location(subfeature, parent, protein2dna) def rebase(parent, child, interpro=False, protein2dna=False): child_features = __get_features(child, interpro=interpro) for rec in GFF.parse(parent): # TODO, replace with recursion in case it's matched against a # non-parent feature. We're cheating a bit here right now... replacement_features = [] for feature in rec.features: if feature.id in child_features: new_subfeatures = child_features[feature.id] # TODO: update starts fixed_subfeatures = [] for x in new_subfeatures: # Then update the location of the actual feature __update_feature_location(x, feature, protein2dna) if interpro: for y in ('status', 'Target'): try: del x.qualifiers[y] except: pass fixed_subfeatures.append(x) replacement_features.extend(fixed_subfeatures) # We do this so we don't include the original set of features that we # were rebasing against in our result. rec.features = replacement_features GFF.write([rec], sys.stdout) if __name__ == '__main__': parser = argparse.ArgumentParser(description='rebase gff3 features against parent locations', epilog="") parser.add_argument('parent', type=file, help='Parent GFF3 annotations') parser.add_argument('child', help='Child GFF3 annotations to rebase against parent') parser.add_argument('--interpro', action='store_true', help='Interpro specific modifications') parser.add_argument('--protein2dna', action='store_true', help='Map protein translated results to original DNA data') args = parser.parse_args() rebase(**vars(args))