Mercurial > repos > cpt > cpt_gff_rebase
comparison gff3.py @ 1:4f4b413056f6 draft
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
| author | cpt | 
|---|---|
| date | Mon, 05 Jun 2023 02:44:12 +0000 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| 0:6e7e20cb1fc7 | 1:4f4b413056f6 | 
|---|---|
| 1 import copy | |
| 2 import logging | |
| 3 | |
| 4 log = logging.getLogger() | |
| 5 log.setLevel(logging.WARN) | |
| 6 | |
| 7 | |
| 8 def feature_lambda( | |
| 9 feature_list, | |
| 10 test, | |
| 11 test_kwargs, | |
| 12 subfeatures=True, | |
| 13 parent=None, | |
| 14 invert=False, | |
| 15 recurse=True, | |
| 16 ): | |
| 17 """Recursively search through features, testing each with a test function, yielding matches. | |
| 18 | |
| 19 GFF3 is a hierachical data structure, so we need to be able to recursively | |
| 20 search through features. E.g. if you're looking for a feature with | |
| 21 ID='bob.42', you can't just do a simple list comprehension with a test | |
| 22 case. You don't know how deeply burried bob.42 will be in the feature tree. This is where feature_lambda steps in. | |
| 23 | |
| 24 :type feature_list: list | |
| 25 :param feature_list: an iterable of features | |
| 26 | |
| 27 :type test: function reference | |
| 28 :param test: a closure with the method signature (feature, **kwargs) where | |
| 29 the kwargs are those passed in the next argument. This | |
| 30 function should return True or False, True if the feature is | |
| 31 to be yielded as part of the main feature_lambda function, or | |
| 32 False if it is to be ignored. This function CAN mutate the | |
| 33 features passed to it (think "apply"). | |
| 34 | |
| 35 :type test_kwargs: dictionary | |
| 36 :param test_kwargs: kwargs to pass to your closure when it is called. | |
| 37 | |
| 38 :type subfeatures: boolean | |
| 39 :param subfeatures: when a feature is matched, should just that feature be | |
| 40 yielded to the caller, or should the entire sub_feature | |
| 41 tree for that feature be included? subfeatures=True is | |
| 42 useful in cases such as searching for a gene feature, | |
| 43 and wanting to know what RBS/Shine_Dalgarno_sequences | |
| 44 are in the sub_feature tree (which can be accomplished | |
| 45 with two feature_lambda calls). subfeatures=False is | |
| 46 useful in cases when you want to process (and possibly | |
| 47 return) the entire feature tree, such as applying a | |
| 48 qualifier to every single feature. | |
| 49 | |
| 50 :type invert: boolean | |
| 51 :param invert: Negate/invert the result of the filter. | |
| 52 | |
| 53 :rtype: yielded list | |
| 54 :return: Yields a list of matching features. | |
| 55 """ | |
| 56 # Either the top level set of [features] or the subfeature attribute | |
| 57 for feature in feature_list: | |
| 58 feature._parent = parent | |
| 59 if not parent: | |
| 60 # Set to self so we cannot go above root. | |
| 61 feature._parent = feature | |
| 62 test_result = test(feature, **test_kwargs) | |
| 63 # if (not invert and test_result) or (invert and not test_result): | |
| 64 if invert ^ test_result: | |
| 65 if not subfeatures: | |
| 66 feature_copy = copy.deepcopy(feature) | |
| 67 feature_copy.sub_features = list() | |
| 68 yield feature_copy | |
| 69 else: | |
| 70 yield feature | |
| 71 | |
| 72 if recurse and hasattr(feature, "sub_features"): | |
| 73 for x in feature_lambda( | |
| 74 feature.sub_features, | |
| 75 test, | |
| 76 test_kwargs, | |
| 77 subfeatures=subfeatures, | |
| 78 parent=feature, | |
| 79 invert=invert, | |
| 80 recurse=recurse, | |
| 81 ): | |
| 82 yield x | |
| 83 | |
| 84 | |
| 85 def fetchParent(feature): | |
| 86 if not hasattr(feature, "_parent") or feature._parent is None: | |
| 87 return feature | |
| 88 else: | |
| 89 return fetchParent(feature._parent) | |
| 90 | |
| 91 | |
| 92 def feature_test_true(feature, **kwargs): | |
| 93 return True | |
| 94 | |
| 95 | |
| 96 def feature_test_type(feature, **kwargs): | |
| 97 if "type" in kwargs: | |
| 98 return str(feature.type).upper() == str(kwargs["type"]).upper() | |
| 99 elif "types" in kwargs: | |
| 100 for x in kwargs["types"]: | |
| 101 if str(feature.type).upper() == str(x).upper(): | |
| 102 return True | |
| 103 return False | |
| 104 raise Exception("Incorrect feature_test_type call, need type or types") | |
| 105 | |
| 106 | |
| 107 def feature_test_qual_value(feature, **kwargs): | |
| 108 """Test qualifier values. | |
| 109 | |
| 110 For every feature, check that at least one value in | |
| 111 feature.quailfiers(kwargs['qualifier']) is in kwargs['attribute_list'] | |
| 112 """ | |
| 113 if isinstance(kwargs["qualifier"], list): | |
| 114 for qualifier in kwargs["qualifier"]: | |
| 115 for attribute_value in feature.qualifiers.get(qualifier, []): | |
| 116 if attribute_value in kwargs["attribute_list"]: | |
| 117 return True | |
| 118 else: | |
| 119 for attribute_value in feature.qualifiers.get(kwargs["qualifier"], []): | |
| 120 if attribute_value in kwargs["attribute_list"]: | |
| 121 return True | |
| 122 return False | |
| 123 | |
| 124 | |
| 125 def feature_test_location(feature, **kwargs): | |
| 126 if "strand" in kwargs: | |
| 127 if feature.location.strand != kwargs["strand"]: | |
| 128 return False | |
| 129 | |
| 130 return feature.location.start <= kwargs["loc"] <= feature.location.end | |
| 131 | |
| 132 | |
| 133 def feature_test_quals(feature, **kwargs): | |
| 134 """ | |
| 135 Example:: | |
| 136 | |
| 137 a = Feature(qualifiers={'Note': ['Some notes', 'Aasdf']}) | |
| 138 | |
| 139 # Check if a contains a Note | |
| 140 feature_test_quals(a, {'Note': None}) # Returns True | |
| 141 feature_test_quals(a, {'Product': None}) # Returns False | |
| 142 | |
| 143 # Check if a contains a note with specific value | |
| 144 feature_test_quals(a, {'Note': ['ome']}) # Returns True | |
| 145 | |
| 146 # Check if a contains a note with specific value | |
| 147 feature_test_quals(a, {'Note': ['other']}) # Returns False | |
| 148 """ | |
| 149 for key in kwargs: | |
| 150 if key not in feature.qualifiers: | |
| 151 return False | |
| 152 | |
| 153 # Key is present, no value specified | |
| 154 if kwargs[key] is None: | |
| 155 return True | |
| 156 | |
| 157 # Otherwise there is a key value we're looking for. | |
| 158 # so we make a list of matches | |
| 159 matches = [] | |
| 160 # And check all of the feature qualifier valuse | |
| 161 for value in feature.qualifiers[key]: | |
| 162 # For that kwargs[key] value | |
| 163 for x in kwargs[key]: | |
| 164 matches.append(x in value) | |
| 165 | |
| 166 # If none matched, then we return false. | |
| 167 if not any(matches): | |
| 168 return False | |
| 169 | |
| 170 return True | |
| 171 | |
| 172 | |
| 173 def feature_test_contains(feature, **kwargs): | |
| 174 if "index" in kwargs: | |
| 175 return feature.location.start < kwargs["index"] < feature.location.end | |
| 176 elif "range" in kwargs: | |
| 177 return ( | |
| 178 feature.location.start < kwargs["range"]["start"] < feature.location.end | |
| 179 and feature.location.start < kwargs["range"]["end"] < feature.location.end | |
| 180 ) | |
| 181 else: | |
| 182 raise RuntimeError("Must use index or range keyword") | |
| 183 | |
| 184 | |
| 185 def get_id(feature=None, parent_prefix=None): | |
| 186 result = "" | |
| 187 if parent_prefix is not None: | |
| 188 result += parent_prefix + "|" | |
| 189 if "locus_tag" in feature.qualifiers: | |
| 190 result += feature.qualifiers["locus_tag"][0] | |
| 191 elif "gene" in feature.qualifiers: | |
| 192 result += feature.qualifiers["gene"][0] | |
| 193 elif "Gene" in feature.qualifiers: | |
| 194 result += feature.qualifiers["Gene"][0] | |
| 195 elif "product" in feature.qualifiers: | |
| 196 result += feature.qualifiers["product"][0] | |
| 197 elif "Product" in feature.qualifiers: | |
| 198 result += feature.qualifiers["Product"][0] | |
| 199 elif "Name" in feature.qualifiers: | |
| 200 result += feature.qualifiers["Name"][0] | |
| 201 else: | |
| 202 return feature.id | |
| 203 # Leaving in case bad things happen. | |
| 204 # result += '%s_%s_%s_%s' % ( | |
| 205 # feature.id, | |
| 206 # feature.location.start, | |
| 207 # feature.location.end, | |
| 208 # feature.location.strand | |
| 209 # ) | |
| 210 return result | |
| 211 | |
| 212 | |
| 213 def get_gff3_id(gene): | |
| 214 return gene.qualifiers.get("Name", [gene.id])[0] | |
| 215 | |
| 216 | |
| 217 def ensure_location_in_bounds(start=0, end=0, parent_length=0): | |
| 218 # This prevents frameshift errors | |
| 219 while start < 0: | |
| 220 start += 3 | |
| 221 while end < 0: | |
| 222 end += 3 | |
| 223 while start > parent_length: | |
| 224 start -= 3 | |
| 225 while end > parent_length: | |
| 226 end -= 3 | |
| 227 return (start, end) | |
| 228 | |
| 229 | |
| 230 def coding_genes(feature_list): | |
| 231 for x in genes(feature_list): | |
| 232 if ( | |
| 233 len( | |
| 234 list( | |
| 235 feature_lambda( | |
| 236 x.sub_features, | |
| 237 feature_test_type, | |
| 238 {"type": "CDS"}, | |
| 239 subfeatures=False, | |
| 240 ) | |
| 241 ) | |
| 242 ) | |
| 243 > 0 | |
| 244 ): | |
| 245 yield x | |
| 246 | |
| 247 | |
| 248 def genes(feature_list, feature_type="gene", sort=False): | |
| 249 """ | |
| 250 Simple filter to extract gene features from the feature set. | |
| 251 """ | |
| 252 | |
| 253 if not sort: | |
| 254 for x in feature_lambda( | |
| 255 feature_list, feature_test_type, {"type": feature_type}, subfeatures=True | |
| 256 ): | |
| 257 yield x | |
| 258 else: | |
| 259 data = list(genes(feature_list, feature_type=feature_type, sort=False)) | |
| 260 data = sorted(data, key=lambda feature: feature.location.start) | |
| 261 for x in data: | |
| 262 yield x | |
| 263 | |
| 264 | |
| 265 def wa_unified_product_name(feature): | |
| 266 """ | |
| 267 Try and figure out a name. We gave conflicting instructions, so | |
| 268 this isn't as trivial as it should be. Sometimes it will be in | |
| 269 'product' or 'Product', othertimes in 'Name' | |
| 270 """ | |
| 271 # Manually applied tags. | |
| 272 protein_product = feature.qualifiers.get( | |
| 273 "product", feature.qualifiers.get("Product", [None]) | |
| 274 )[0] | |
| 275 | |
| 276 # If neither of those are available ... | |
| 277 if protein_product is None: | |
| 278 # And there's a name... | |
| 279 if "Name" in feature.qualifiers: | |
| 280 if not is_uuid(feature.qualifiers["Name"][0]): | |
| 281 protein_product = feature.qualifiers["Name"][0] | |
| 282 | |
| 283 return protein_product | |
| 284 | |
| 285 | |
| 286 def is_uuid(name): | |
| 287 return name.count("-") == 4 and len(name) == 36 | |
| 288 | |
| 289 | |
| 290 def get_rbs_from(gene): | |
| 291 # Normal RBS annotation types | |
| 292 rbs_rbs = list( | |
| 293 feature_lambda( | |
| 294 gene.sub_features, feature_test_type, {"type": "RBS"}, subfeatures=False | |
| 295 ) | |
| 296 ) | |
| 297 rbs_sds = list( | |
| 298 feature_lambda( | |
| 299 gene.sub_features, | |
| 300 feature_test_type, | |
| 301 {"type": "Shine_Dalgarno_sequence"}, | |
| 302 subfeatures=False, | |
| 303 ) | |
| 304 ) | |
| 305 # Fraking apollo | |
| 306 apollo_exons = list( | |
| 307 feature_lambda( | |
| 308 gene.sub_features, feature_test_type, {"type": "exon"}, subfeatures=False | |
| 309 ) | |
| 310 ) | |
| 311 apollo_exons = [x for x in apollo_exons if len(x) < 10] | |
| 312 # These are more NCBI's style | |
| 313 regulatory_elements = list( | |
| 314 feature_lambda( | |
| 315 gene.sub_features, | |
| 316 feature_test_type, | |
| 317 {"type": "regulatory"}, | |
| 318 subfeatures=False, | |
| 319 ) | |
| 320 ) | |
| 321 rbs_regulatory = list( | |
| 322 feature_lambda( | |
| 323 regulatory_elements, | |
| 324 feature_test_quals, | |
| 325 {"regulatory_class": ["ribosome_binding_site"]}, | |
| 326 subfeatures=False, | |
| 327 ) | |
| 328 ) | |
| 329 # Here's hoping you find just one ;) | |
| 330 return rbs_rbs + rbs_sds + rbs_regulatory + apollo_exons | |
| 331 | |
| 332 | |
| 333 def nice_name(record): | |
| 334 """ | |
| 335 get the real name rather than NCBI IDs and so on. If fails, will return record.id | |
| 336 """ | |
| 337 name = record.id | |
| 338 likely_parental_contig = list(genes(record.features, feature_type="contig")) | |
| 339 if len(likely_parental_contig) == 1: | |
| 340 name = likely_parental_contig[0].qualifiers.get("organism", [name])[0] | |
| 341 return name | |
| 342 | |
| 343 | |
| 344 def fsort(it): | |
| 345 for i in sorted(it, key=lambda x: int(x.location.start)): | |
| 346 yield i | 
