annotate cpt_renumber_gbk/renumber.py @ 0:8cac332dbc77 draft default tip

Uploaded
author cpt
date Fri, 17 Jun 2022 13:13:47 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
1 #!/usr/bin/env python
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
2 import BIO_FIX_TOPO # NOQA
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
3 import argparse
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
4 import sys # noqa
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
5 from Bio import SeqIO
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
6
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
7 import logging
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
8
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
9 logging.basicConfig(level=logging.INFO)
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
10 log = logging.getLogger()
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
11
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
12 # gene and RBS features are also included in the tagged features list, but are dealt with specifically elsewhere.
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
13 # This is used to filter out just valid "in gene" features
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
14 TAGGED_FEATURES = ["CDS", "tRNA", "intron", "mat_peptide"]
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
15
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
16
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
17 def renumber_genes(
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
18 gbk_files,
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
19 tag_to_update="locus_tag",
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
20 string_prefix="display_id",
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
21 leading_zeros=3,
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
22 forceTagMatch=False,
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
23 change_table=None,
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
24 ):
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
25
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
26 for gbk_file in gbk_files:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
27 for record in SeqIO.parse(gbk_file, "genbank"):
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
28 if string_prefix == "display_id":
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
29 format_string = record.id + "_%0" + str(leading_zeros) + "d"
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
30 else:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
31 format_string = string_prefix + "%0" + str(leading_zeros) + "d"
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
32
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
33 # f_cds = [f for f in record.features if f.type == 'CDS']
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
34 # f_rbs = [f for f in record.features if f.type == 'RBS']
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
35 # f_gene = [f for f in record.features if f.type == 'gene']
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
36 # f_intron = [f for f in record.features if f.type == 'intron']
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
37 # f_trna = [f for f in record.features if f.type == 'tRNA']
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
38 # f_pep = [f for f in record.features if f.type == 'mat_peptide']
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
39 # f_oth = [f for f in record.features if f.type not in ['CDS', 'RBS',
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
40 # 'gene', 'intron',
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
41 # 'tRNA', 'mat_peptide']]
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
42 # Apparently we're numbering tRNAs now, thanks for telling me.
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
43 # f_oth2 = []
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
44 # for q in sorted(f_oth, key=lambda x: x.location.start):
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
45 # if q.type == 'tRNA':
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
46 # q.qualifiers['locus_tag'] = format_string_t % tRNA_count
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
47 # tRNA_count += 1
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
48 # f_oth2.append(q)
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
49 # else:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
50 # f_oth2.append(q)
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
51 # f_oth = f_oth2
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
52
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
53 # f_care_about = []
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
54
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
55 # Make sure we've hit every RBS and gene
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
56 # for cds in f_cds:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
57 # If there's an associated gene feature, it will share a stop codon
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
58 # if cds.location.strand > 0:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
59 # associated_genes = [f for f in f_gene if f.location.end ==
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
60 # cds.location.end]
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
61 # else:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
62 # associated_genes = [f for f in f_gene if f.location.start ==
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
63 # cds.location.start]
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
64
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
65 # # If there's an RBS it'll be upstream a bit.
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
66 # if cds.location.strand > 0:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
67 # associated_rbss = [f for f in f_rbs if f.location.end <
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
68 # cds.location.start and f.location.end >
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
69 # cds.location.start - 24]
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
70 # else:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
71 # associated_rbss = [f for f in f_rbs if f.location.start >
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
72 # cds.location.end and f.location.start <
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
73 # cds.location.end + 24]
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
74 # tmp_result = [cds]
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
75 # if len(associated_genes) > 0:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
76 # tmp_result.append(associated_genes[0])
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
77
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
78 # if len(associated_rbss) == 1:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
79 # tmp_result.append(associated_rbss[0])
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
80 # else:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
81 # log.warning("%s RBSs found for %s", len(associated_rbss), cds.location)
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
82 # We choose to append to f_other as that has all features not
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
83 # already accessed. It may mean that some gene/RBS features are
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
84 # missed if they aren't detected here, which we'll need to handle.
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
85 # f_care_about.append(tmp_result)
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
86
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
87 #####-----------------------------------------------------------------------------------------------------------#####
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
88 # Build list of genes, then iterate over non-gene features and sort into containing genes.
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
89 # tags are assigned based on genes, so start the lists with the gene features
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
90 f_gene = sorted(
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
91 [f for f in record.features if f.type == "gene"],
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
92 key=lambda x: x.location.start,
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
93 )
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
94 oldNames = []
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
95 for x in f_gene:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
96 if tag_to_update in x.qualifiers.keys():
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
97 oldNames.append(x.qualifiers[tag_to_update])
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
98
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
99 f_rbs = sorted(
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
100 [f for f in record.features if f.type == "RBS"],
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
101 key=lambda x: x.location.start,
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
102 )
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
103 f_tag = list()
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
104 f_sorted = sorted(
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
105 [f for f in record.features if f.type in TAGGED_FEATURES],
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
106 key=lambda x: x.location.start,
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
107 )
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
108 # genes not in the TAGGED_FEATURES list are exluded from the processing and assumed to already be clean
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
109 clean_features = sorted(
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
110 [
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
111 f
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
112 for f in record.features
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
113 if f.type not in TAGGED_FEATURES and f.type not in ["gene", "RBS"]
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
114 ],
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
115 key=lambda x: x.location.start,
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
116 )
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
117
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
118 f_processed = []
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
119 for gene in f_gene:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
120 tag = [gene]
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
121 if gene.location.strand >= 0: # Be strict on where to find starting RBS
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
122 geneComp = gene.location.start
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
123 else:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
124 geneComp = gene.location.end
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
125 # find the gene's RBS feature
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
126 for rbs in [f for f in f_rbs if f not in f_processed]:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
127 if is_within(rbs, gene) and (
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
128 rbs.location.start == geneComp or rbs.location.end == geneComp
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
129 ):
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
130 if (tag_to_update not in rbs.qualifiers.keys()):
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
131 tag.append(rbs)
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
132 f_processed.append(rbs)
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
133 break
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
134 elif (tag_to_update not in gene.qualifiers.keys()): # This will gurantee qual is in gene and RBS for next check
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
135 tag.append(rbs)
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
136 f_processed.append(rbs)
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
137 break
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
138 elif (not forceTagMatch) or (rbs.qualifiers[tag_to_update] == gene.qualifiers[tag_to_update]):
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
139 tag.append(rbs)
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
140 f_processed.append(rbs)
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
141 break
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
142
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
143 # find all other non-RBS features
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
144 for feature in [f for f in f_sorted if f not in f_processed]:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
145 # If the feature is within the gene boundaries (genes are the first entry in tag list),
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
146 # add it to the same locus tag group, does not process RBS
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
147 if is_within(feature, gene):
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
148 if tag_to_update not in feature.qualifiers.keys():
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
149 # catches genes and CDS feature that are intron-contained.
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
150 if feature.type == "CDS":
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
151 if (
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
152 feature.location.start == gene.location.start
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
153 or feature.location.end == gene.location.end
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
154 ):
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
155
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
156 tag.append(feature)
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
157 f_processed.append(feature)
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
158 else:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
159 tag.append(feature)
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
160 f_processed.append(feature)
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
161 elif (not forceTagMatch) or (tag_to_update in gene.qualifiers.keys() and feature.qualifiers[tag_to_update] == gene.qualifiers[tag_to_update]):
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
162 tag.append(feature)
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
163 f_processed.append(feature)
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
164 elif feature.location.start > gene.location.end:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
165 # because the features are sorted by coordinates,
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
166 # no features further down on the list will be in this gene
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
167 break
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
168 f_tag.append(tag)
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
169
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
170 # Process for frameshifts and mat_peptides (inteins)
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
171
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
172 # check for overlapped genes
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
173 # at this point, relevant features are put into tag buckets along with the containing gene
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
174 # matin the form of [gene, feature1, feature2, ...]
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
175 for rbs in [f for f in f_rbs if f not in f_processed]:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
176 dupeRBS = False
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
177 for x in f_processed:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
178 if x.type == "RBS" and (tag_to_update in rbs.qualifiers.keys() and tag_to_update in x.qualifiers.keys() and rbs.qualifiers[tag_to_update] == x.qualifiers[tag_to_update]):
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
179 dupeRBS = True
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
180 if dupeRBS:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
181 change_table.write(
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
182 record.id
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
183 + "\t"
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
184 + rbs.type
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
185 + ":"
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
186 + (rbs.qualifiers[tag_to_update][0])
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
187 + "\t[Removed: Parent gene already had an RBS]\n"
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
188 )
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
189 else:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
190 change_table.write(
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
191 record.id
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
192 + "\t"
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
193 + rbs.type
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
194 + ":"
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
195 + (rbs.qualifiers[tag_to_update][0])
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
196 + "\t[Removed: RBS did not both fall within boundary of gene and share a boundary with a gene]\n"
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
197 )
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
198
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
199
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
200 tag_index = 1
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
201 delta = []
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
202 for tag in f_tag: # each tag list is one 'bucket'
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
203 new_tag_value = format_string % tag_index
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
204 for feature in tag:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
205 original_tag_value = delta_old(feature, tag_to_update)
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
206 feature.qualifiers[tag_to_update] = [new_tag_value]
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
207 # Once the tag is renumbered, it's added to the clean list for later output
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
208 clean_features.append(feature)
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
209 delta.append(
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
210 "\t".join((record.id, original_tag_value, new_tag_value))
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
211 )
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
212 tag_index += 1
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
213
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
214 # Why must these people start at 1
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
215 # Because we have to modify the array we work on a separate one
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
216 # clean_features = f_oth
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
217 # delta = []
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
218 # for index, feature_list in enumerate(sorted(f_care_about, key=lambda x: x[0].location.start)):
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
219 # for f in feature_list:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
220 # original_tag_value = delta_old(f, tag_to_update)
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
221 # # Add 1 to index for 1-indexed counting for happy scientists
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
222 # new_tag_value = format_string % (index+1)
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
223 # f.qualifiers[tag_to_update] = [new_tag_value]
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
224 # clean_features.append(f)
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
225 # delta.append('\t'.join((record.id, original_tag_value, new_tag_value)))
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
226
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
227 # Update all features
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
228 record.features = sorted(clean_features, key=lambda x: x.location.start)
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
229
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
230 for feature in [f for f in f_sorted if f not in f_processed]:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
231 if feature.type == "CDS":
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
232 if tag_to_update in feature.qualifiers.keys() and forceTagMatch:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
233 failNameCheck = True
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
234 for x in oldNames:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
235 for tag in feature.qualifiers[tag_to_update]:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
236 if tag in x:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
237 failNameCheck = False
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
238 if not failNameCheck:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
239 break
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
240 if failNameCheck:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
241 change_table.write(
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
242 record.id
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
243 + "\t"
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
244 + feature.type
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
245 + ":"
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
246 + (feature.qualifiers[tag_to_update][0])
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
247 + "\t[Removed: (Tag check enabled) CDS did not both share a start/end with and fall within a gene with the same " + tag_to_update + " value]\n"
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
248 )
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
249 else:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
250 change_table.write(
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
251 record.id
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
252 + "\t"
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
253 + feature.type
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
254 + ":"
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
255 + (feature.qualifiers[tag_to_update][0])
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
256 + "\t[Removed: CDS did not both fall within boundary of gene and share a boundary with a gene]\n"
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
257 )
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
258 elif tag_to_update in feature.qualifiers.keys():
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
259 change_table.write(
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
260 record.id
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
261 + "\t"
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
262 + feature.type
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
263 + ":"
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
264 + (feature.qualifiers[tag_to_update][0])
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
265 + "\t[Removed: CDS did not both fall within boundary of gene and share a boundary with a gene]\n"
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
266 )
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
267 else:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
268 change_table.write(
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
269 record.id
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
270 + "\t"
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
271 + feature.type
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
272 + ": No "
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
273 + tag_to_update
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
274 + "\t[Removed: CDS at (" + str(feature.location.start) + "," + str(feature.location.end) + ") did not both fall within boundary of gene and share a boundary with a gene]\n"
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
275 )
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
276 else:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
277 if tag_to_update in feature.qualifiers.keys() and forceTagMatch:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
278 failNameCheck = True
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
279 for x in oldNames:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
280 for tag in feature.qualifiers[tag_to_update]:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
281 if tag in x:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
282 failNameCheck = False
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
283 if not failNameCheck:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
284 break
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
285 if failNameCheck:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
286 change_table.write(
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
287 record.id
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
288 + "\t"
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
289 + feature.type
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
290 + ":"
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
291 + (feature.qualifiers[tag_to_update][0])
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
292 + "\t[Removed: (Tag check enabled) Feature did not fall within a gene it shared a " + tag_to_update + " value with]\n"
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
293 )
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
294 else:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
295 change_table.write(
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
296 record.id
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
297 + "\t"
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
298 + feature.type
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
299 + ":"
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
300 + (feature.qualifiers[tag_to_update][0])
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
301 + "\t[Removed: Feature not within boundary of a gene]\n"
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
302 )
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
303 elif tag_to_update in feature.qualifiers.keys():
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
304 change_table.write(
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
305 record.id
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
306 + "\t"
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
307 + feature.type
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
308 + ":"
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
309 + (feature.qualifiers[tag_to_update][0])
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
310 + "\t[Removed: Feature not within boundary of a gene]\n"
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
311 )
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
312 else:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
313 change_table.write(
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
314 record.id
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
315 + "\t"
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
316 + feature.type
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
317 + ": (has no "
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
318 + tag_to_update
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
319 + ")\t[Removed: Feature not within boundary of a gene]\n"
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
320 )
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
321 change_table.write("\n".join(delta) + "\n")
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
322
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
323 # Output
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
324 yield record
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
325
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
326
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
327 def delta_old(feature, tag_to_update):
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
328 # First part of delta entry, old name
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
329 if tag_to_update in feature.qualifiers:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
330 return feature.qualifiers[tag_to_update][0]
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
331 else:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
332 return "%s %s %s" % (
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
333 feature.location.start,
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
334 feature.location.end,
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
335 feature.location.strand,
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
336 )
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
337
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
338
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
339 def is_within(query, feature):
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
340 # checks if the query item is within the bounds of the given feature
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
341 sortedList = sorted(query.location.parts, key=lambda x: x.start)
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
342 for x in sortedList:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
343 if (
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
344 feature.location.start <= x.start
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
345 and feature.location.end >= x.end
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
346 ):
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
347 if x.strand < 0 and x == sortedList[-1]:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
348 return True
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
349 elif x.strand >= 0 and x == sortedList[0]:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
350 return True
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
351 #else:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
352 return False
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
353
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
354
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
355 # def fix_frameshift(a, b):
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
356 # #checks if gene a and gene b are a frameshifted gene (either shares a start or an end and an RBS)
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
357 # if a[0].location.start == b[0].location.start or a[0].location.end == b[0].location.end:
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
358 # # It is likely a frameshift. Treat is as such. Find shared RBS, determine which CDS is which
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
359 # big_gene = a if (a[0].location.end - a[0].location.start) > (b[0].location.end - b[0].location.start) else b
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
360 # small_gene = a if big_gene==b else b
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
361 # rbs = [f for f in a if f.type == 'RBS']
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
362 # # In the way that the tag lists are generated, the larger gene should contain both CDS features.
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
363 # # Retrieve and dermine big/small CDS
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
364 # cdss = [f for f in big_gene if f.type == 'CDS']
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
365 # big_cds = cdss[0] if (cdss[0].location.end - cdss[0].location.start) > (cdss[1].location.end - cdss[1].location.start) else cdss[1]
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
366 # small_cds = cdss[0] if big_cds==cdss[1] else cdss[1]
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
367
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
368
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
369 if __name__ == "__main__":
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
370 parser = argparse.ArgumentParser(description="Renumber genbank files")
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
371 parser.add_argument(
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
372 "gbk_files", type=argparse.FileType("r"), nargs="+", help="Genbank files"
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
373 )
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
374 parser.add_argument(
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
375 "--tag_to_update", type=str, help="Tag to update", default="locus_tag"
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
376 )
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
377 parser.add_argument(
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
378 "--string_prefix", type=str, help="Prefix string", default="display_id"
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
379 )
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
380 parser.add_argument(
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
381 "--leading_zeros", type=int, help="# of leading zeroes", default=3
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
382 )
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
383
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
384 parser.add_argument(
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
385 "--forceTagMatch", action="store_true", help="Make non-CDS features match tag initially"
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
386 )
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
387
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
388 parser.add_argument(
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
389 "--change_table",
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
390 type=argparse.FileType("w"),
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
391 help="Location to store change table in",
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
392 default="renumber.tsv",
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
393 )
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
394
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
395 args = parser.parse_args()
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
396 for record in renumber_genes(**vars(args)):
8cac332dbc77 Uploaded
cpt
parents:
diff changeset
397 SeqIO.write(record, sys.stdout, "genbank")