Mercurial > repos > cpt > cpt_gbk_to_gff
comparison cpt_gbk_to_gff/gbk_to_gff3.py @ 0:a68f32350196 draft
Uploaded
author | cpt |
---|---|
date | Fri, 17 Jun 2022 12:46:43 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:a68f32350196 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import argparse | |
4 import sys | |
5 | |
6 from Bio import SeqIO | |
7 from Bio.SeqRecord import SeqRecord | |
8 from Bio.SeqFeature import FeatureLocation | |
9 from CPT_GFFParser import gffSeqFeature, gffWrite | |
10 | |
11 bottomFeatTypes = ["exon", "RBS", "CDS"] | |
12 | |
13 def makeGffFeat(inFeat, num, recName, identifier): | |
14 if inFeat.type == "RBS" or (inFeat.type == "regulatory" and "regulatory_class" in inFeat.qualifiers.keys() and inFeat.qualifiers["regulatory_class"][0] == "ribosome_binding_site"): | |
15 inFeat.type = "Shine_Dalgarno_sequence" | |
16 if "codon_start" in inFeat.qualifiers.keys(): | |
17 shift = int(inFeat.qualifiers["codon_start"][0]) - 1 | |
18 else: | |
19 shift = "." | |
20 if identifier in inFeat.qualifiers.keys(): | |
21 name = inFeat.qualifiers[identifier][0] + "." + inFeat.type | |
22 if num > 0: | |
23 name += "." + str(num) | |
24 else: | |
25 name = recName + "." + inFeat.type + "." + str(num) | |
26 | |
27 outFeat = gffSeqFeature(inFeat.location, inFeat.type, '', inFeat.strand, name, inFeat.qualifiers, None, None, None, shift, 0, "GbkToGff") | |
28 outFeat.qualifiers["ID"] = [name] | |
29 return outFeat | |
30 | |
31 def main(inFile, makeMRNA, makeGene, identifier, fastaFile, outFile): | |
32 | |
33 ofh = sys.stdout | |
34 if outFile: | |
35 ofh = outFile | |
36 | |
37 outRec = [] | |
38 failed = 0 | |
39 for rec in SeqIO.parse(inFile, "genbank"): | |
40 recID = rec.name | |
41 | |
42 if len(str(rec.seq)) > 0: | |
43 seqs_pending_writes = True | |
44 outSeq = str(rec.seq) | |
45 seqLen = len(outSeq) | |
46 | |
47 locBucket = {} | |
48 outFeats = [] | |
49 topTypeDict = {} | |
50 seekingParent = [] | |
51 geneNum = 0 | |
52 autoGeneNum = 0 | |
53 for feat in rec.features: | |
54 if identifier not in feat.qualifiers.keys(): #Allow metadata features and other features with no ID (Output warning?) - AJC | |
55 if feat.type in bottomFeatTypes: | |
56 seekingParent.append([feat, [], []]) # [Feature, all parent candidates, strongest parent candidates] | |
57 continue | |
58 elif feat.type not in topTypeDict.keys(): | |
59 topTypeDict[feat.type] = 1 | |
60 else: | |
61 topTypeDict[feat.type] += 1 | |
62 outFeats.append(makeGffFeat(feat, topTypeDict[feat.type], recID, identifier)) | |
63 continue | |
64 elif feat.qualifiers[identifier][0] not in locBucket.keys(): | |
65 locBucket[feat.qualifiers[identifier][0]] = [] | |
66 locBucket[feat.qualifiers[identifier][0]].append(feat) | |
67 | |
68 for locus in locBucket.keys(): | |
69 minLoc = locBucket[locus][0].location.start | |
70 maxLoc = locBucket[locus][0].location.end | |
71 for feat in locBucket[locus]: | |
72 minLoc = min(minLoc, feat.location.start) | |
73 maxLoc = max(maxLoc, feat.location.end) | |
74 for x in seekingParent: | |
75 if x[0].location.start >= minLoc and x[0].location.end <= maxLoc: | |
76 x[1].append(locus) | |
77 if x[0].location.start == minLoc or x[0].location.end == maxLoc: | |
78 x[2].append(locus) | |
79 | |
80 for x in seekingParent: #Reformat to [Feature, Locus, Unused/Free] | |
81 if len(x[2]) == 1: | |
82 finList = "" | |
83 if len(x[1]) > 1: | |
84 for loc in x[1]: | |
85 if loc != x[2][0]: | |
86 finList += loc + ", " | |
87 finList = str(x[0].type) + " had no locus tag set in .gbk file, automatically derived. Other, weaker candidate(s) were " + finList[0:-2] + "." | |
88 else: | |
89 finList = str(x[0].type) + " had no locus tag set in .gbk file, automatically derived." | |
90 if "Notes" not in x[0].qualifiers.keys(): | |
91 x[0].qualifiers["Notes"] = [] | |
92 x[0].qualifiers["Notes"].append(finList) | |
93 x[1] = x[2][0] | |
94 elif len(x[2]) > 1: | |
95 candidate = x[2][0] #Arbitrarily choose first one | |
96 finList = "" | |
97 strongList = "" | |
98 for loc in x[2]: | |
99 if loc != candidate: | |
100 finList += loc + ", " | |
101 strongList += loc + ", " | |
102 for loc in x[1]: | |
103 if loc not in x[2]: | |
104 finList += loc + ", " | |
105 finList = str(x[0].type) + " had no locus tag set in .gbk file, automatically derived. Other candidate(s) were " + finList[0:-2] + " (Equally strong candidate(s): " + strongList[0:-2] + ")." | |
106 if "Notes" not in x[0].qualifiers.keys(): | |
107 x[0].qualifiers["Notes"] = [] | |
108 x[0].qualifiers["Notes"].append(finList) | |
109 x[1] = candidate | |
110 elif len(x[1]) == 1: | |
111 x[1] = x[1][0] | |
112 if "Notes" not in x[0].qualifiers.keys(): | |
113 x[0].qualifiers["Notes"] = [] | |
114 finList = str(x[0].type) + " had no locus tag set in .gbk file, automatically derived." | |
115 x[0].qualifiers["Notes"].append(finList) | |
116 elif len(x[1]) > 1: | |
117 candidate = x[1][0] #Arbitrarily choose first one | |
118 finList = "" | |
119 for loc in x[1]: | |
120 if loc != candidate: | |
121 finList += loc + ", " | |
122 finList = str(x[0].type) + " had no locus tag set in .gbk file, automatically derived. Other candidates were " + finList[0:-2] + "." | |
123 if "Notes" not in x[0].qualifiers.keys(): | |
124 x[0].qualifiers["Notes"] = [] | |
125 x[0].qualifiers["Notes"].append(finList) | |
126 x[1] = candidate | |
127 else: | |
128 if makeGene: | |
129 sys.stderr.write("Warning: Unable to find potential parent for feature with no " + identifier + " of type " + str(x[0].type) + " at location [" + str(x[0].location.start + 1) + ", " + str(x[0].location.end) + "], creating standalone gene.\n") | |
130 autoGeneNum += 1 | |
131 x[0].source = "GbkToGff" | |
132 x[0].score = 0 | |
133 x[0].shift = 0 | |
134 if "ID" not in x[0].qualifiers.keys(): | |
135 x[0].qualifiers["ID"] = [recID + ".standalone_" + x[0].type + "." + str(autoGeneNum)] | |
136 tempName = recID + ".derived_Gene." + str(autoGeneNum) | |
137 tempQuals = {"ID" : [tempName], "Notes" : ["Gene feature automatically generated by Gbk to GFF conversion"]} | |
138 tempGene = gffSeqFeature(FeatureLocation(x[0].location.start, x[0].location.end, x[0].location.strand), 'gene', '', x[0].strand, tempName, tempQuals, None, None, None, ".", 0, "GbkToGff") | |
139 if makeMRNA: | |
140 tempName = recID + ".derived_mRNA." + str(autoGeneNum) | |
141 tempQuals = {"ID" : [tempName], "Notes" : ["mRNA feature automatically generated by Gbk to GFF conversion"]} | |
142 tempGene.sub_features.append(gffSeqFeature(FeatureLocation(x[0].location.start, x[0].location.end, x[0].location.strand), 'mRNA', '', x[0].strand, tempName, tempQuals, None, None, None, ".", 0, "GbkToGff")) | |
143 tempGene.sub_features[-1].sub_features.append(x[0]) | |
144 else: | |
145 tempGene.sub_features.append(x[0]) | |
146 | |
147 | |
148 outFeats.append(tempGene) | |
149 else: | |
150 sys.stderr.write("Warning: Unable to find potential parent for feature with no " + identifier + " of type " + str(x[0].type) + " at location [" + str(x[0].location.start + 1) + ", " + str(x[0].location.end) + "].\n") | |
151 if x[0].type not in topTypeDict.keys(): | |
152 topTypeDict[x[0].type] = 1 | |
153 else: | |
154 topTypeDict[x[0].type] += 1 | |
155 outFeats.append(makeGffFeat(x[0], topTypeDict[x[0].type], recID, identifier)) | |
156 | |
157 for locus in locBucket.keys(): | |
158 if len(locBucket[locus]) == 1: # No heirarchy to be made | |
159 outFeats.append(makeGffFeat(locBucket[locus][0], 0, recID, identifier)) | |
160 continue | |
161 topFeat = None | |
162 midFeat = None | |
163 bottomFeats = [] | |
164 typeDict = {} | |
165 minLoc = locBucket[locus][0].location.start | |
166 maxLoc = locBucket[locus][0].location.end | |
167 geneNum += 1 | |
168 for feat in locBucket[locus]: | |
169 # If we want to make our own top-level feat? | |
170 minLoc = min(minLoc, feat.location.start) | |
171 maxLoc = max(maxLoc, feat.location.end) | |
172 | |
173 # Gene->mRNA->CDS included as example, to add other feature-heirarchys in the appropriate slot | |
174 if feat.type in ['gene']: | |
175 if not topFeat: | |
176 topFeat = feat | |
177 # Else handle multiple top features | |
178 elif feat.type in ['mRNA', 'tRNA', 'rRNA']: | |
179 if not midFeat: | |
180 midFeat = feat | |
181 # Else handle multiple mid feats (May need another elif type-in-list statement if we actually expect a list of mid feats) | |
182 else: | |
183 if feat.type not in typeDict.keys(): | |
184 typeDict[feat.type] = 1 | |
185 else: | |
186 typeDict[feat.type] += 1 | |
187 bottomFeats.append(feat) | |
188 | |
189 for x in seekingParent: | |
190 if type(x[1]) != "list" and locus == x[1]: | |
191 x[0].qualifiers[identifier] = [locus] | |
192 bottomFeats.append(x[0]) | |
193 if x[0].type not in typeDict.keys(): | |
194 typeDict[x[0].type] = 1 | |
195 else: | |
196 typeDict[x[0].type] += 1 | |
197 | |
198 | |
199 | |
200 | |
201 | |
202 #if not topFeat: # Make our own top-level feature based off minLoc, maxLoc bounds | |
203 | |
204 for x in typeDict.keys(): # If only 1, set it to 0 so we don't append a number to the name | |
205 if typeDict[x] == 1: # Else, set to 1 so that we count up as we encounter the features | |
206 typeDict[x] = 0 | |
207 else: | |
208 typeDict[x] = 1 | |
209 | |
210 if not topFeat: | |
211 if makeGene: | |
212 if midFeat: | |
213 possibleStrand = midFeat.strand | |
214 else: | |
215 possibleStrand = bottomFeats[0].strand | |
216 tempName = recID + ".gene." + str(geneNum) | |
217 tempQuals = {identifier : [locus], "ID" : [tempName], "Notes" : ["Gene feature automatically generated by Gbk to GFF conversion"]} | |
218 topFeat = gffSeqFeature(FeatureLocation(minLoc, maxLoc, possibleStrand), 'gene', '', possibleStrand, tempName, tempQuals, None, None, None, ".", 0, "GbkToGff") | |
219 else: | |
220 sys.stderr.write("Unable to create a feature heirarchy at location [%d, %d] with features: \n" % (minLoc, maxLoc)) | |
221 for x in locBucket[locus]: | |
222 sys.stderr.write(str(x)) | |
223 sys.stderr.write('\n') | |
224 failed = 1 | |
225 continue | |
226 | |
227 outFeats.append(makeGffFeat(topFeat, 0, recID, identifier)) | |
228 if not midFeat and topFeat.type == "gene" and makeMRNA: | |
229 if identifier in topFeat.qualifiers.keys(): | |
230 tempName = topFeat.qualifiers[identifier][0] + ".mRNA" | |
231 tempQuals = {identifier : topFeat.qualifiers[identifier], "ID" : [tempName], "Notes" : ["mRNA feature automatically generated by Gbk to GFF conversion"]} | |
232 else: | |
233 tempName = outFeats[-1].ID + ".mRNA" | |
234 tempQuals = {identifier : topFeat.qualifiers[identifier], "ID" : [tempName], "Notes" : ["mRNA feature automatically generated by Gbk to GFF conversion"]} | |
235 midFeat = gffSeqFeature(FeatureLocation(minLoc, maxLoc, topFeat.strand), 'mRNA', '', topFeat.strand, tempName, tempQuals, None, None, None, ".", 0, "GbkToGff") | |
236 | |
237 if midFeat: # Again, need a new if statement if we want to handle multiple mid-tier features | |
238 outFeats[-1].sub_features.append(makeGffFeat(midFeat, 0, recID, identifier)) | |
239 outFeats[-1].sub_features[-1].qualifiers["Parent"] = [outFeats[-1].id] | |
240 for x in bottomFeats: | |
241 typeDict[x.type] += 1 | |
242 outFeats[-1].sub_features[-1].sub_features.append(makeGffFeat(x, typeDict[x.type], recID, identifier)) | |
243 outFeats[-1].sub_features[-1].sub_features[-1].qualifiers["Parent"] = [outFeats[-1].sub_features[-1].id] | |
244 else: # No midFeat, append bottom feats directly to top feats | |
245 for x in bottomFeats: | |
246 typeDict[x.type] += 1 | |
247 outFeats[-1].sub_features.append(makeGffFeat(x, typeDict[x.type], recID, identifier)) | |
248 outFeats[-1].sub_features[-1].qualifiers["Parent"] = [outFeats[-1].id] | |
249 | |
250 outRec.append(SeqRecord(rec.seq, recID, rec.name, rec.description, rec.dbxrefs, sorted(outFeats, key=lambda x: x.location.start), rec.annotations, rec.letter_annotations)) | |
251 SeqIO.write([outRec[-1]], fastaFile, "fasta") | |
252 gffWrite(outRec, ofh) | |
253 exit(failed) # 0 if all features handled, 1 if unable to handle some | |
254 | |
255 | |
256 if __name__ == '__main__': | |
257 parser = argparse.ArgumentParser( description='Biopython solution to Gbk to GFF conversion') | |
258 | |
259 parser.add_argument('inFile', type=argparse.FileType("r"), help='Path to an input GBK file' ) | |
260 parser.add_argument('--makeMRNA', action="store_true", required=False, help="Automatically create mRNA features") | |
261 parser.add_argument('--makeGene', action="store_true", required=False, help="Automatically create missing Gene features") | |
262 parser.add_argument('--identifier', type=str, default="locus_tag", required=False, help="Qualifier to derive ID property from") | |
263 parser.add_argument('--fastaFile', type=argparse.FileType("w"), help='Fasta output for sequences' ) | |
264 parser.add_argument('--outFile', type=argparse.FileType("w"), help='GFF feature output' ) | |
265 args = parser.parse_args() | |
266 main(**vars(args)) | |
267 | |
268 | |
269 | |
270 | |
271 | |
272 | |
273 | |
274 |