comparison gff3_rebase.py @ 98:b1260bca5fdc draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/jbrowse2 commit 44d8fc559ecf5463a8f753561976fa26686c96f6
author bgruening
date Wed, 05 Jun 2024 10:00:07 +0000
parents 39b717d934a8
children
comparison
equal deleted inserted replaced
97:74074746ccd8 98:b1260bca5fdc
61 feature_copy.sub_features = [] 61 feature_copy.sub_features = []
62 yield feature_copy 62 yield feature_copy
63 else: 63 else:
64 yield feature 64 yield feature
65 65
66 if hasattr(feature, "sub_features"): 66 if hasattr(feature, 'sub_features'):
67 for x in feature_lambda( 67 for x in feature_lambda(feature.sub_features, test, test_kwargs, subfeatures=subfeatures):
68 feature.sub_features,
69 test,
70 test_kwargs,
71 subfeatures=subfeatures,
72 ):
73 yield x 68 yield x
74 69
75 70
76 def feature_test_qual_value(feature, **kwargs): 71 def feature_test_qual_value(feature, **kwargs):
77 """Test qualifier values. 72 """Test qualifier values.
78 73
79 For every feature, check that at least one value in 74 For every feature, check that at least one value in
80 feature.quailfiers(kwargs['qualifier']) is in kwargs['attribute_list'] 75 feature.quailfiers(kwargs['qualifier']) is in kwargs['attribute_list']
81 """ 76 """
82 for attribute_value in feature.qualifiers.get(kwargs["qualifier"], []): 77 for attribute_value in feature.qualifiers.get(kwargs['qualifier'], []):
83 if attribute_value in kwargs["attribute_list"]: 78 if attribute_value in kwargs['attribute_list']:
84 return True 79 return True
85 return False 80 return False
86 81
87 82
88 def __get_features(child, interpro=False): 83 def __get_features(child, interpro=False):
93 # Get the record id as parent_feature_id (since this is how it will be during remapping) 88 # Get the record id as parent_feature_id (since this is how it will be during remapping)
94 parent_feature_id = rec.id 89 parent_feature_id = rec.id
95 # If it's an interpro specific gff3 file 90 # If it's an interpro specific gff3 file
96 if interpro: 91 if interpro:
97 # Then we ignore polypeptide features as they're useless 92 # Then we ignore polypeptide features as they're useless
98 if feature.type == "polypeptide": 93 if feature.type == 'polypeptide':
99 continue 94 continue
100 # If there's an underscore, we strip up to that underscore? 95 # If there's an underscore, we strip up to that underscore?
101 # I do not know the rationale for this, removing. 96 # I do not know the rationale for this, removing.
102 # if '_' in parent_feature_id: 97 # if '_' in parent_feature_id:
103 # parent_feature_id = parent_feature_id[parent_feature_id.index('_') + 1:] 98 # parent_feature_id = parent_feature_id[parent_feature_id.index('_') + 1:]
104 99
105 try: 100 try:
106 child_features[parent_feature_id].append(feature) 101 child_features[parent_feature_id].append(feature)
107 except KeyError: 102 except KeyError:
108 child_features[parent_feature_id] = [feature] 103 child_features[parent_feature_id] = [feature]
115 end = feature.location.end 110 end = feature.location.end
116 if protein2dna: 111 if protein2dna:
117 start *= 3 112 start *= 3
118 end *= 3 113 end *= 3
119 114
120 if parent.location.strand != None and parent.location.strand >= 0: 115 if parent.location.strand >= 0:
121 ns = parent.location.start + start 116 ns = parent.location.start + start
122 ne = parent.location.start + end 117 ne = parent.location.start + end
123 st = +1 118 st = +1
124 else: 119 else:
125 ns = parent.location.end - end 120 ns = parent.location.end - end
134 # frame that it should be in. 129 # frame that it should be in.
135 if ns < 0: 130 if ns < 0:
136 ns %= 3 131 ns %= 3
137 if ne < 0: 132 if ne < 0:
138 ne %= 3 133 ne %= 3
139 if ns > ne: 134
140 ne, ns = ns, ne # dunno why but sometimes happens
141 feature.location = FeatureLocation(ns, ne, strand=st) 135 feature.location = FeatureLocation(ns, ne, strand=st)
142 136
143 if hasattr(feature, "sub_features"): 137 if hasattr(feature, 'sub_features'):
144 for subfeature in feature.sub_features: 138 for subfeature in feature.sub_features:
145 __update_feature_location(subfeature, parent, protein2dna) 139 __update_feature_location(subfeature, parent, protein2dna)
146 140
147 141
148 def rebase(parent, child, interpro=False, protein2dna=False, map_by="ID"): 142 def rebase(parent, child, interpro=False, protein2dna=False, map_by='ID'):
149 # get all of the features we will be re-mapping in a dictionary, keyed by parent feature ID 143 # get all of the features we will be re-mapping in a dictionary, keyed by parent feature ID
150 child_features = __get_features(child, interpro=interpro) 144 child_features = __get_features(child, interpro=interpro)
151 145
152 for rec in GFF.parse(parent): 146 for rec in GFF.parse(parent):
153 replacement_features = [] 147 replacement_features = []
154 for feature in feature_lambda( 148 for feature in feature_lambda(
155 rec.features, 149 rec.features,
156 # Filter features in the parent genome by those that are 150 # Filter features in the parent genome by those that are
157 # "interesting", i.e. have results in child_features array. 151 # "interesting", i.e. have results in child_features array.
158 # Probably an unnecessary optimisation. 152 # Probably an unnecessary optimisation.
159 feature_test_qual_value, 153 feature_test_qual_value,
160 { 154 {
161 "qualifier": map_by, 155 'qualifier': map_by,
162 "attribute_list": child_features.keys(), 156 'attribute_list': child_features.keys(),
163 }, 157 },
164 subfeatures=False, 158 subfeatures=False):
165 ):
166 159
167 # Features which will be re-mapped 160 # Features which will be re-mapped
168 to_remap = child_features[feature.id] 161 to_remap = child_features[feature.id]
169 # TODO: update starts 162 # TODO: update starts
170 fixed_features = [] 163 fixed_features = []
171 for x in to_remap: 164 for x in to_remap:
172 # Then update the location of the actual feature 165 # Then update the location of the actual feature
173 __update_feature_location(x, feature, protein2dna) 166 __update_feature_location(x, feature, protein2dna)
174 167
175 if interpro: 168 if interpro:
176 for y in ("status", "Target"): 169 for y in ('status', 'Target'):
177 try: 170 try:
178 del x.qualifiers[y] 171 del x.qualifiers[y]
179 except Exception: 172 except Exception:
180 pass 173 pass
181 174
186 rec.features = replacement_features 179 rec.features = replacement_features
187 rec.annotations = {} 180 rec.annotations = {}
188 GFF.write([rec], sys.stdout) 181 GFF.write([rec], sys.stdout)
189 182
190 183
191 if __name__ == "__main__": 184 if __name__ == '__main__':
192 parser = argparse.ArgumentParser( 185 parser = argparse.ArgumentParser(description='rebase gff3 features against parent locations', epilog="")
193 description="rebase gff3 features against parent locations", epilog="" 186 parser.add_argument('parent', type=argparse.FileType('r'), help='Parent GFF3 annotations')
194 ) 187 parser.add_argument('child', type=argparse.FileType('r'), help='Child GFF3 annotations to rebase against parent')
195 parser.add_argument( 188 parser.add_argument('--interpro', action='store_true',
196 "parent", type=argparse.FileType("r"), help="Parent GFF3 annotations" 189 help='Interpro specific modifications')
197 ) 190 parser.add_argument('--protein2dna', action='store_true',
198 parser.add_argument( 191 help='Map protein translated results to original DNA data')
199 "child", 192 parser.add_argument('--map_by', help='Map by key', default='ID')
200 type=argparse.FileType("r"),
201 help="Child GFF3 annotations to rebase against parent",
202 )
203 parser.add_argument(
204 "--interpro",
205 action="store_true",
206 help="Interpro specific modifications",
207 )
208 parser.add_argument(
209 "--protein2dna",
210 action="store_true",
211 help="Map protein translated results to original DNA data",
212 )
213 parser.add_argument("--map_by", help="Map by key", default="ID")
214 args = parser.parse_args() 193 args = parser.parse_args()
215 rebase(**vars(args)) 194 rebase(**vars(args))