comparison cpt_renumber_gbk/renumber.py @ 0:8cac332dbc77 draft default tip

Uploaded
author cpt
date Fri, 17 Jun 2022 13:13:47 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:8cac332dbc77
1 #!/usr/bin/env python
2 import BIO_FIX_TOPO # NOQA
3 import argparse
4 import sys # noqa
5 from Bio import SeqIO
6
7 import logging
8
9 logging.basicConfig(level=logging.INFO)
10 log = logging.getLogger()
11
12 # gene and RBS features are also included in the tagged features list, but are dealt with specifically elsewhere.
13 # This is used to filter out just valid "in gene" features
14 TAGGED_FEATURES = ["CDS", "tRNA", "intron", "mat_peptide"]
15
16
17 def renumber_genes(
18 gbk_files,
19 tag_to_update="locus_tag",
20 string_prefix="display_id",
21 leading_zeros=3,
22 forceTagMatch=False,
23 change_table=None,
24 ):
25
26 for gbk_file in gbk_files:
27 for record in SeqIO.parse(gbk_file, "genbank"):
28 if string_prefix == "display_id":
29 format_string = record.id + "_%0" + str(leading_zeros) + "d"
30 else:
31 format_string = string_prefix + "%0" + str(leading_zeros) + "d"
32
33 # f_cds = [f for f in record.features if f.type == 'CDS']
34 # f_rbs = [f for f in record.features if f.type == 'RBS']
35 # f_gene = [f for f in record.features if f.type == 'gene']
36 # f_intron = [f for f in record.features if f.type == 'intron']
37 # f_trna = [f for f in record.features if f.type == 'tRNA']
38 # f_pep = [f for f in record.features if f.type == 'mat_peptide']
39 # f_oth = [f for f in record.features if f.type not in ['CDS', 'RBS',
40 # 'gene', 'intron',
41 # 'tRNA', 'mat_peptide']]
42 # Apparently we're numbering tRNAs now, thanks for telling me.
43 # f_oth2 = []
44 # for q in sorted(f_oth, key=lambda x: x.location.start):
45 # if q.type == 'tRNA':
46 # q.qualifiers['locus_tag'] = format_string_t % tRNA_count
47 # tRNA_count += 1
48 # f_oth2.append(q)
49 # else:
50 # f_oth2.append(q)
51 # f_oth = f_oth2
52
53 # f_care_about = []
54
55 # Make sure we've hit every RBS and gene
56 # for cds in f_cds:
57 # If there's an associated gene feature, it will share a stop codon
58 # if cds.location.strand > 0:
59 # associated_genes = [f for f in f_gene if f.location.end ==
60 # cds.location.end]
61 # else:
62 # associated_genes = [f for f in f_gene if f.location.start ==
63 # cds.location.start]
64
65 # # If there's an RBS it'll be upstream a bit.
66 # if cds.location.strand > 0:
67 # associated_rbss = [f for f in f_rbs if f.location.end <
68 # cds.location.start and f.location.end >
69 # cds.location.start - 24]
70 # else:
71 # associated_rbss = [f for f in f_rbs if f.location.start >
72 # cds.location.end and f.location.start <
73 # cds.location.end + 24]
74 # tmp_result = [cds]
75 # if len(associated_genes) > 0:
76 # tmp_result.append(associated_genes[0])
77
78 # if len(associated_rbss) == 1:
79 # tmp_result.append(associated_rbss[0])
80 # else:
81 # log.warning("%s RBSs found for %s", len(associated_rbss), cds.location)
82 # We choose to append to f_other as that has all features not
83 # already accessed. It may mean that some gene/RBS features are
84 # missed if they aren't detected here, which we'll need to handle.
85 # f_care_about.append(tmp_result)
86
87 #####-----------------------------------------------------------------------------------------------------------#####
88 # Build list of genes, then iterate over non-gene features and sort into containing genes.
89 # tags are assigned based on genes, so start the lists with the gene features
90 f_gene = sorted(
91 [f for f in record.features if f.type == "gene"],
92 key=lambda x: x.location.start,
93 )
94 oldNames = []
95 for x in f_gene:
96 if tag_to_update in x.qualifiers.keys():
97 oldNames.append(x.qualifiers[tag_to_update])
98
99 f_rbs = sorted(
100 [f for f in record.features if f.type == "RBS"],
101 key=lambda x: x.location.start,
102 )
103 f_tag = list()
104 f_sorted = sorted(
105 [f for f in record.features if f.type in TAGGED_FEATURES],
106 key=lambda x: x.location.start,
107 )
108 # genes not in the TAGGED_FEATURES list are exluded from the processing and assumed to already be clean
109 clean_features = sorted(
110 [
111 f
112 for f in record.features
113 if f.type not in TAGGED_FEATURES and f.type not in ["gene", "RBS"]
114 ],
115 key=lambda x: x.location.start,
116 )
117
118 f_processed = []
119 for gene in f_gene:
120 tag = [gene]
121 if gene.location.strand >= 0: # Be strict on where to find starting RBS
122 geneComp = gene.location.start
123 else:
124 geneComp = gene.location.end
125 # find the gene's RBS feature
126 for rbs in [f for f in f_rbs if f not in f_processed]:
127 if is_within(rbs, gene) and (
128 rbs.location.start == geneComp or rbs.location.end == geneComp
129 ):
130 if (tag_to_update not in rbs.qualifiers.keys()):
131 tag.append(rbs)
132 f_processed.append(rbs)
133 break
134 elif (tag_to_update not in gene.qualifiers.keys()): # This will gurantee qual is in gene and RBS for next check
135 tag.append(rbs)
136 f_processed.append(rbs)
137 break
138 elif (not forceTagMatch) or (rbs.qualifiers[tag_to_update] == gene.qualifiers[tag_to_update]):
139 tag.append(rbs)
140 f_processed.append(rbs)
141 break
142
143 # find all other non-RBS features
144 for feature in [f for f in f_sorted if f not in f_processed]:
145 # If the feature is within the gene boundaries (genes are the first entry in tag list),
146 # add it to the same locus tag group, does not process RBS
147 if is_within(feature, gene):
148 if tag_to_update not in feature.qualifiers.keys():
149 # catches genes and CDS feature that are intron-contained.
150 if feature.type == "CDS":
151 if (
152 feature.location.start == gene.location.start
153 or feature.location.end == gene.location.end
154 ):
155
156 tag.append(feature)
157 f_processed.append(feature)
158 else:
159 tag.append(feature)
160 f_processed.append(feature)
161 elif (not forceTagMatch) or (tag_to_update in gene.qualifiers.keys() and feature.qualifiers[tag_to_update] == gene.qualifiers[tag_to_update]):
162 tag.append(feature)
163 f_processed.append(feature)
164 elif feature.location.start > gene.location.end:
165 # because the features are sorted by coordinates,
166 # no features further down on the list will be in this gene
167 break
168 f_tag.append(tag)
169
170 # Process for frameshifts and mat_peptides (inteins)
171
172 # check for overlapped genes
173 # at this point, relevant features are put into tag buckets along with the containing gene
174 # matin the form of [gene, feature1, feature2, ...]
175 for rbs in [f for f in f_rbs if f not in f_processed]:
176 dupeRBS = False
177 for x in f_processed:
178 if x.type == "RBS" and (tag_to_update in rbs.qualifiers.keys() and tag_to_update in x.qualifiers.keys() and rbs.qualifiers[tag_to_update] == x.qualifiers[tag_to_update]):
179 dupeRBS = True
180 if dupeRBS:
181 change_table.write(
182 record.id
183 + "\t"
184 + rbs.type
185 + ":"
186 + (rbs.qualifiers[tag_to_update][0])
187 + "\t[Removed: Parent gene already had an RBS]\n"
188 )
189 else:
190 change_table.write(
191 record.id
192 + "\t"
193 + rbs.type
194 + ":"
195 + (rbs.qualifiers[tag_to_update][0])
196 + "\t[Removed: RBS did not both fall within boundary of gene and share a boundary with a gene]\n"
197 )
198
199
200 tag_index = 1
201 delta = []
202 for tag in f_tag: # each tag list is one 'bucket'
203 new_tag_value = format_string % tag_index
204 for feature in tag:
205 original_tag_value = delta_old(feature, tag_to_update)
206 feature.qualifiers[tag_to_update] = [new_tag_value]
207 # Once the tag is renumbered, it's added to the clean list for later output
208 clean_features.append(feature)
209 delta.append(
210 "\t".join((record.id, original_tag_value, new_tag_value))
211 )
212 tag_index += 1
213
214 # Why must these people start at 1
215 # Because we have to modify the array we work on a separate one
216 # clean_features = f_oth
217 # delta = []
218 # for index, feature_list in enumerate(sorted(f_care_about, key=lambda x: x[0].location.start)):
219 # for f in feature_list:
220 # original_tag_value = delta_old(f, tag_to_update)
221 # # Add 1 to index for 1-indexed counting for happy scientists
222 # new_tag_value = format_string % (index+1)
223 # f.qualifiers[tag_to_update] = [new_tag_value]
224 # clean_features.append(f)
225 # delta.append('\t'.join((record.id, original_tag_value, new_tag_value)))
226
227 # Update all features
228 record.features = sorted(clean_features, key=lambda x: x.location.start)
229
230 for feature in [f for f in f_sorted if f not in f_processed]:
231 if feature.type == "CDS":
232 if tag_to_update in feature.qualifiers.keys() and forceTagMatch:
233 failNameCheck = True
234 for x in oldNames:
235 for tag in feature.qualifiers[tag_to_update]:
236 if tag in x:
237 failNameCheck = False
238 if not failNameCheck:
239 break
240 if failNameCheck:
241 change_table.write(
242 record.id
243 + "\t"
244 + feature.type
245 + ":"
246 + (feature.qualifiers[tag_to_update][0])
247 + "\t[Removed: (Tag check enabled) CDS did not both share a start/end with and fall within a gene with the same " + tag_to_update + " value]\n"
248 )
249 else:
250 change_table.write(
251 record.id
252 + "\t"
253 + feature.type
254 + ":"
255 + (feature.qualifiers[tag_to_update][0])
256 + "\t[Removed: CDS did not both fall within boundary of gene and share a boundary with a gene]\n"
257 )
258 elif tag_to_update in feature.qualifiers.keys():
259 change_table.write(
260 record.id
261 + "\t"
262 + feature.type
263 + ":"
264 + (feature.qualifiers[tag_to_update][0])
265 + "\t[Removed: CDS did not both fall within boundary of gene and share a boundary with a gene]\n"
266 )
267 else:
268 change_table.write(
269 record.id
270 + "\t"
271 + feature.type
272 + ": No "
273 + tag_to_update
274 + "\t[Removed: CDS at (" + str(feature.location.start) + "," + str(feature.location.end) + ") did not both fall within boundary of gene and share a boundary with a gene]\n"
275 )
276 else:
277 if tag_to_update in feature.qualifiers.keys() and forceTagMatch:
278 failNameCheck = True
279 for x in oldNames:
280 for tag in feature.qualifiers[tag_to_update]:
281 if tag in x:
282 failNameCheck = False
283 if not failNameCheck:
284 break
285 if failNameCheck:
286 change_table.write(
287 record.id
288 + "\t"
289 + feature.type
290 + ":"
291 + (feature.qualifiers[tag_to_update][0])
292 + "\t[Removed: (Tag check enabled) Feature did not fall within a gene it shared a " + tag_to_update + " value with]\n"
293 )
294 else:
295 change_table.write(
296 record.id
297 + "\t"
298 + feature.type
299 + ":"
300 + (feature.qualifiers[tag_to_update][0])
301 + "\t[Removed: Feature not within boundary of a gene]\n"
302 )
303 elif tag_to_update in feature.qualifiers.keys():
304 change_table.write(
305 record.id
306 + "\t"
307 + feature.type
308 + ":"
309 + (feature.qualifiers[tag_to_update][0])
310 + "\t[Removed: Feature not within boundary of a gene]\n"
311 )
312 else:
313 change_table.write(
314 record.id
315 + "\t"
316 + feature.type
317 + ": (has no "
318 + tag_to_update
319 + ")\t[Removed: Feature not within boundary of a gene]\n"
320 )
321 change_table.write("\n".join(delta) + "\n")
322
323 # Output
324 yield record
325
326
327 def delta_old(feature, tag_to_update):
328 # First part of delta entry, old name
329 if tag_to_update in feature.qualifiers:
330 return feature.qualifiers[tag_to_update][0]
331 else:
332 return "%s %s %s" % (
333 feature.location.start,
334 feature.location.end,
335 feature.location.strand,
336 )
337
338
339 def is_within(query, feature):
340 # checks if the query item is within the bounds of the given feature
341 sortedList = sorted(query.location.parts, key=lambda x: x.start)
342 for x in sortedList:
343 if (
344 feature.location.start <= x.start
345 and feature.location.end >= x.end
346 ):
347 if x.strand < 0 and x == sortedList[-1]:
348 return True
349 elif x.strand >= 0 and x == sortedList[0]:
350 return True
351 #else:
352 return False
353
354
355 # def fix_frameshift(a, b):
356 # #checks if gene a and gene b are a frameshifted gene (either shares a start or an end and an RBS)
357 # if a[0].location.start == b[0].location.start or a[0].location.end == b[0].location.end:
358 # # It is likely a frameshift. Treat is as such. Find shared RBS, determine which CDS is which
359 # big_gene = a if (a[0].location.end - a[0].location.start) > (b[0].location.end - b[0].location.start) else b
360 # small_gene = a if big_gene==b else b
361 # rbs = [f for f in a if f.type == 'RBS']
362 # # In the way that the tag lists are generated, the larger gene should contain both CDS features.
363 # # Retrieve and dermine big/small CDS
364 # cdss = [f for f in big_gene if f.type == 'CDS']
365 # big_cds = cdss[0] if (cdss[0].location.end - cdss[0].location.start) > (cdss[1].location.end - cdss[1].location.start) else cdss[1]
366 # small_cds = cdss[0] if big_cds==cdss[1] else cdss[1]
367
368
369 if __name__ == "__main__":
370 parser = argparse.ArgumentParser(description="Renumber genbank files")
371 parser.add_argument(
372 "gbk_files", type=argparse.FileType("r"), nargs="+", help="Genbank files"
373 )
374 parser.add_argument(
375 "--tag_to_update", type=str, help="Tag to update", default="locus_tag"
376 )
377 parser.add_argument(
378 "--string_prefix", type=str, help="Prefix string", default="display_id"
379 )
380 parser.add_argument(
381 "--leading_zeros", type=int, help="# of leading zeroes", default=3
382 )
383
384 parser.add_argument(
385 "--forceTagMatch", action="store_true", help="Make non-CDS features match tag initially"
386 )
387
388 parser.add_argument(
389 "--change_table",
390 type=argparse.FileType("w"),
391 help="Location to store change table in",
392 default="renumber.tsv",
393 )
394
395 args = parser.parse_args()
396 for record in renumber_genes(**vars(args)):
397 SeqIO.write(record, sys.stdout, "genbank")