comparison gff3_rebase.py @ 2:238981ed43b7 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/gff3_rebase commit 908f16ea4eb082227437dc93e06e8cb742f5a257
author iuc
date Wed, 15 Nov 2017 15:15:12 -0500
parents ea35a85b941d
children
comparison
equal deleted inserted replaced
1:ea35a85b941d 2:238981ed43b7
81 81
82 82
83 def __get_features(child, interpro=False): 83 def __get_features(child, interpro=False):
84 child_features = {} 84 child_features = {}
85 for rec in GFF.parse(child): 85 for rec in GFF.parse(child):
86 # Only top level
86 for feature in rec.features: 87 for feature in rec.features:
88 # Get the record id as parent_feature_id (since this is how it will be during remapping)
87 parent_feature_id = rec.id 89 parent_feature_id = rec.id
90 # If it's an interpro specific gff3 file
88 if interpro: 91 if interpro:
92 # Then we ignore polypeptide features as they're useless
89 if feature.type == 'polypeptide': 93 if feature.type == 'polypeptide':
90 continue 94 continue
91 if '_' in parent_feature_id: 95 # If there's an underscore, we strip up to that underscore?
92 parent_feature_id = parent_feature_id[parent_feature_id.index('_') + 1:] 96 # I do not know the rationale for this, removing.
97 # if '_' in parent_feature_id:
98 # parent_feature_id = parent_feature_id[parent_feature_id.index('_') + 1:]
93 99
94 try: 100 try:
95 child_features[parent_feature_id].append(feature) 101 child_features[parent_feature_id].append(feature)
96 except KeyError: 102 except KeyError:
97 child_features[parent_feature_id] = [feature] 103 child_features[parent_feature_id] = [feature]
104 # Keep a list of feature objects keyed by parent record id
98 return child_features 105 return child_features
99 106
100 107
101 def __update_feature_location(feature, parent, protein2dna): 108 def __update_feature_location(feature, parent, protein2dna):
102 start = feature.location.start 109 start = feature.location.start
130 if hasattr(feature, 'sub_features'): 137 if hasattr(feature, 'sub_features'):
131 for subfeature in feature.sub_features: 138 for subfeature in feature.sub_features:
132 __update_feature_location(subfeature, parent, protein2dna) 139 __update_feature_location(subfeature, parent, protein2dna)
133 140
134 141
135 def rebase(parent, child, interpro=False, protein2dna=False): 142 def rebase(parent, child, interpro=False, protein2dna=False, map_by='ID'):
143 # get all of the features we will be re-mapping in a dictionary, keyed by parent feature ID
136 child_features = __get_features(child, interpro=interpro) 144 child_features = __get_features(child, interpro=interpro)
137 145
138 for rec in GFF.parse(parent): 146 for rec in GFF.parse(parent):
139 replacement_features = [] 147 replacement_features = []
140 for feature in feature_lambda( 148 for feature in feature_lambda(
141 rec.features, 149 rec.features,
150 # Filter features in the parent genome by those that are
151 # "interesting", i.e. have results in child_features array.
152 # Probably an unnecessary optimisation.
142 feature_test_qual_value, 153 feature_test_qual_value,
143 { 154 {
144 'qualifier': 'ID', 155 'qualifier': map_by,
145 'attribute_list': child_features.keys(), 156 'attribute_list': child_features.keys(),
146 }, 157 },
147 subfeatures=False): 158 subfeatures=False):
148 159
149 new_subfeatures = child_features[feature.id] 160 # Features which will be re-mapped
150 fixed_subfeatures = [] 161 to_remap = child_features[feature.id]
151 for x in new_subfeatures: 162 # TODO: update starts
163 fixed_features = []
164 for x in to_remap:
152 # Then update the location of the actual feature 165 # Then update the location of the actual feature
153 __update_feature_location(x, feature, protein2dna) 166 __update_feature_location(x, feature, protein2dna)
154 167
155 if interpro: 168 if interpro:
156 for y in ('status', 'Target'): 169 for y in ('status', 'Target'):
157 try: 170 try:
158 del x.qualifiers[y] 171 del x.qualifiers[y]
159 except Exception: 172 except Exception:
160 pass 173 pass
161 174
162 fixed_subfeatures.append(x) 175 fixed_features.append(x)
163 replacement_features.extend(fixed_subfeatures) 176 replacement_features.extend(fixed_features)
164 # We do this so we don't include the original set of features that we 177 # We do this so we don't include the original set of features that we
165 # were rebasing against in our result. 178 # were rebasing against in our result.
166 rec.features = replacement_features 179 rec.features = replacement_features
167 rec.annotations = {} 180 rec.annotations = {}
168 GFF.write([rec], sys.stdout) 181 GFF.write([rec], sys.stdout)
174 parser.add_argument('child', type=argparse.FileType('r'), help='Child GFF3 annotations to rebase against parent') 187 parser.add_argument('child', type=argparse.FileType('r'), help='Child GFF3 annotations to rebase against parent')
175 parser.add_argument('--interpro', action='store_true', 188 parser.add_argument('--interpro', action='store_true',
176 help='Interpro specific modifications') 189 help='Interpro specific modifications')
177 parser.add_argument('--protein2dna', action='store_true', 190 parser.add_argument('--protein2dna', action='store_true',
178 help='Map protein translated results to original DNA data') 191 help='Map protein translated results to original DNA data')
192 parser.add_argument('--map_by', help='Map by key', default='ID')
179 args = parser.parse_args() 193 args = parser.parse_args()
180 rebase(**vars(args)) 194 rebase(**vars(args))