annotate cpt_gbk_to_gff/gbk_to_gff3.py @ 0:a68f32350196 draft

Uploaded
author cpt
date Fri, 17 Jun 2022 12:46:43 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
a68f32350196 Uploaded
cpt
parents:
diff changeset
1 #!/usr/bin/env python
a68f32350196 Uploaded
cpt
parents:
diff changeset
2
a68f32350196 Uploaded
cpt
parents:
diff changeset
3 import argparse
a68f32350196 Uploaded
cpt
parents:
diff changeset
4 import sys
a68f32350196 Uploaded
cpt
parents:
diff changeset
5
a68f32350196 Uploaded
cpt
parents:
diff changeset
6 from Bio import SeqIO
a68f32350196 Uploaded
cpt
parents:
diff changeset
7 from Bio.SeqRecord import SeqRecord
a68f32350196 Uploaded
cpt
parents:
diff changeset
8 from Bio.SeqFeature import FeatureLocation
a68f32350196 Uploaded
cpt
parents:
diff changeset
9 from CPT_GFFParser import gffSeqFeature, gffWrite
a68f32350196 Uploaded
cpt
parents:
diff changeset
10
a68f32350196 Uploaded
cpt
parents:
diff changeset
11 bottomFeatTypes = ["exon", "RBS", "CDS"]
a68f32350196 Uploaded
cpt
parents:
diff changeset
12
a68f32350196 Uploaded
cpt
parents:
diff changeset
13 def makeGffFeat(inFeat, num, recName, identifier):
a68f32350196 Uploaded
cpt
parents:
diff changeset
14 if inFeat.type == "RBS" or (inFeat.type == "regulatory" and "regulatory_class" in inFeat.qualifiers.keys() and inFeat.qualifiers["regulatory_class"][0] == "ribosome_binding_site"):
a68f32350196 Uploaded
cpt
parents:
diff changeset
15 inFeat.type = "Shine_Dalgarno_sequence"
a68f32350196 Uploaded
cpt
parents:
diff changeset
16 if "codon_start" in inFeat.qualifiers.keys():
a68f32350196 Uploaded
cpt
parents:
diff changeset
17 shift = int(inFeat.qualifiers["codon_start"][0]) - 1
a68f32350196 Uploaded
cpt
parents:
diff changeset
18 else:
a68f32350196 Uploaded
cpt
parents:
diff changeset
19 shift = "."
a68f32350196 Uploaded
cpt
parents:
diff changeset
20 if identifier in inFeat.qualifiers.keys():
a68f32350196 Uploaded
cpt
parents:
diff changeset
21 name = inFeat.qualifiers[identifier][0] + "." + inFeat.type
a68f32350196 Uploaded
cpt
parents:
diff changeset
22 if num > 0:
a68f32350196 Uploaded
cpt
parents:
diff changeset
23 name += "." + str(num)
a68f32350196 Uploaded
cpt
parents:
diff changeset
24 else:
a68f32350196 Uploaded
cpt
parents:
diff changeset
25 name = recName + "." + inFeat.type + "." + str(num)
a68f32350196 Uploaded
cpt
parents:
diff changeset
26
a68f32350196 Uploaded
cpt
parents:
diff changeset
27 outFeat = gffSeqFeature(inFeat.location, inFeat.type, '', inFeat.strand, name, inFeat.qualifiers, None, None, None, shift, 0, "GbkToGff")
a68f32350196 Uploaded
cpt
parents:
diff changeset
28 outFeat.qualifiers["ID"] = [name]
a68f32350196 Uploaded
cpt
parents:
diff changeset
29 return outFeat
a68f32350196 Uploaded
cpt
parents:
diff changeset
30
a68f32350196 Uploaded
cpt
parents:
diff changeset
31 def main(inFile, makeMRNA, makeGene, identifier, fastaFile, outFile):
a68f32350196 Uploaded
cpt
parents:
diff changeset
32
a68f32350196 Uploaded
cpt
parents:
diff changeset
33 ofh = sys.stdout
a68f32350196 Uploaded
cpt
parents:
diff changeset
34 if outFile:
a68f32350196 Uploaded
cpt
parents:
diff changeset
35 ofh = outFile
a68f32350196 Uploaded
cpt
parents:
diff changeset
36
a68f32350196 Uploaded
cpt
parents:
diff changeset
37 outRec = []
a68f32350196 Uploaded
cpt
parents:
diff changeset
38 failed = 0
a68f32350196 Uploaded
cpt
parents:
diff changeset
39 for rec in SeqIO.parse(inFile, "genbank"):
a68f32350196 Uploaded
cpt
parents:
diff changeset
40 recID = rec.name
a68f32350196 Uploaded
cpt
parents:
diff changeset
41
a68f32350196 Uploaded
cpt
parents:
diff changeset
42 if len(str(rec.seq)) > 0:
a68f32350196 Uploaded
cpt
parents:
diff changeset
43 seqs_pending_writes = True
a68f32350196 Uploaded
cpt
parents:
diff changeset
44 outSeq = str(rec.seq)
a68f32350196 Uploaded
cpt
parents:
diff changeset
45 seqLen = len(outSeq)
a68f32350196 Uploaded
cpt
parents:
diff changeset
46
a68f32350196 Uploaded
cpt
parents:
diff changeset
47 locBucket = {}
a68f32350196 Uploaded
cpt
parents:
diff changeset
48 outFeats = []
a68f32350196 Uploaded
cpt
parents:
diff changeset
49 topTypeDict = {}
a68f32350196 Uploaded
cpt
parents:
diff changeset
50 seekingParent = []
a68f32350196 Uploaded
cpt
parents:
diff changeset
51 geneNum = 0
a68f32350196 Uploaded
cpt
parents:
diff changeset
52 autoGeneNum = 0
a68f32350196 Uploaded
cpt
parents:
diff changeset
53 for feat in rec.features:
a68f32350196 Uploaded
cpt
parents:
diff changeset
54 if identifier not in feat.qualifiers.keys(): #Allow metadata features and other features with no ID (Output warning?) - AJC
a68f32350196 Uploaded
cpt
parents:
diff changeset
55 if feat.type in bottomFeatTypes:
a68f32350196 Uploaded
cpt
parents:
diff changeset
56 seekingParent.append([feat, [], []]) # [Feature, all parent candidates, strongest parent candidates]
a68f32350196 Uploaded
cpt
parents:
diff changeset
57 continue
a68f32350196 Uploaded
cpt
parents:
diff changeset
58 elif feat.type not in topTypeDict.keys():
a68f32350196 Uploaded
cpt
parents:
diff changeset
59 topTypeDict[feat.type] = 1
a68f32350196 Uploaded
cpt
parents:
diff changeset
60 else:
a68f32350196 Uploaded
cpt
parents:
diff changeset
61 topTypeDict[feat.type] += 1
a68f32350196 Uploaded
cpt
parents:
diff changeset
62 outFeats.append(makeGffFeat(feat, topTypeDict[feat.type], recID, identifier))
a68f32350196 Uploaded
cpt
parents:
diff changeset
63 continue
a68f32350196 Uploaded
cpt
parents:
diff changeset
64 elif feat.qualifiers[identifier][0] not in locBucket.keys():
a68f32350196 Uploaded
cpt
parents:
diff changeset
65 locBucket[feat.qualifiers[identifier][0]] = []
a68f32350196 Uploaded
cpt
parents:
diff changeset
66 locBucket[feat.qualifiers[identifier][0]].append(feat)
a68f32350196 Uploaded
cpt
parents:
diff changeset
67
a68f32350196 Uploaded
cpt
parents:
diff changeset
68 for locus in locBucket.keys():
a68f32350196 Uploaded
cpt
parents:
diff changeset
69 minLoc = locBucket[locus][0].location.start
a68f32350196 Uploaded
cpt
parents:
diff changeset
70 maxLoc = locBucket[locus][0].location.end
a68f32350196 Uploaded
cpt
parents:
diff changeset
71 for feat in locBucket[locus]:
a68f32350196 Uploaded
cpt
parents:
diff changeset
72 minLoc = min(minLoc, feat.location.start)
a68f32350196 Uploaded
cpt
parents:
diff changeset
73 maxLoc = max(maxLoc, feat.location.end)
a68f32350196 Uploaded
cpt
parents:
diff changeset
74 for x in seekingParent:
a68f32350196 Uploaded
cpt
parents:
diff changeset
75 if x[0].location.start >= minLoc and x[0].location.end <= maxLoc:
a68f32350196 Uploaded
cpt
parents:
diff changeset
76 x[1].append(locus)
a68f32350196 Uploaded
cpt
parents:
diff changeset
77 if x[0].location.start == minLoc or x[0].location.end == maxLoc:
a68f32350196 Uploaded
cpt
parents:
diff changeset
78 x[2].append(locus)
a68f32350196 Uploaded
cpt
parents:
diff changeset
79
a68f32350196 Uploaded
cpt
parents:
diff changeset
80 for x in seekingParent: #Reformat to [Feature, Locus, Unused/Free]
a68f32350196 Uploaded
cpt
parents:
diff changeset
81 if len(x[2]) == 1:
a68f32350196 Uploaded
cpt
parents:
diff changeset
82 finList = ""
a68f32350196 Uploaded
cpt
parents:
diff changeset
83 if len(x[1]) > 1:
a68f32350196 Uploaded
cpt
parents:
diff changeset
84 for loc in x[1]:
a68f32350196 Uploaded
cpt
parents:
diff changeset
85 if loc != x[2][0]:
a68f32350196 Uploaded
cpt
parents:
diff changeset
86 finList += loc + ", "
a68f32350196 Uploaded
cpt
parents:
diff changeset
87 finList = str(x[0].type) + " had no locus tag set in .gbk file, automatically derived. Other, weaker candidate(s) were " + finList[0:-2] + "."
a68f32350196 Uploaded
cpt
parents:
diff changeset
88 else:
a68f32350196 Uploaded
cpt
parents:
diff changeset
89 finList = str(x[0].type) + " had no locus tag set in .gbk file, automatically derived."
a68f32350196 Uploaded
cpt
parents:
diff changeset
90 if "Notes" not in x[0].qualifiers.keys():
a68f32350196 Uploaded
cpt
parents:
diff changeset
91 x[0].qualifiers["Notes"] = []
a68f32350196 Uploaded
cpt
parents:
diff changeset
92 x[0].qualifiers["Notes"].append(finList)
a68f32350196 Uploaded
cpt
parents:
diff changeset
93 x[1] = x[2][0]
a68f32350196 Uploaded
cpt
parents:
diff changeset
94 elif len(x[2]) > 1:
a68f32350196 Uploaded
cpt
parents:
diff changeset
95 candidate = x[2][0] #Arbitrarily choose first one
a68f32350196 Uploaded
cpt
parents:
diff changeset
96 finList = ""
a68f32350196 Uploaded
cpt
parents:
diff changeset
97 strongList = ""
a68f32350196 Uploaded
cpt
parents:
diff changeset
98 for loc in x[2]:
a68f32350196 Uploaded
cpt
parents:
diff changeset
99 if loc != candidate:
a68f32350196 Uploaded
cpt
parents:
diff changeset
100 finList += loc + ", "
a68f32350196 Uploaded
cpt
parents:
diff changeset
101 strongList += loc + ", "
a68f32350196 Uploaded
cpt
parents:
diff changeset
102 for loc in x[1]:
a68f32350196 Uploaded
cpt
parents:
diff changeset
103 if loc not in x[2]:
a68f32350196 Uploaded
cpt
parents:
diff changeset
104 finList += loc + ", "
a68f32350196 Uploaded
cpt
parents:
diff changeset
105 finList = str(x[0].type) + " had no locus tag set in .gbk file, automatically derived. Other candidate(s) were " + finList[0:-2] + " (Equally strong candidate(s): " + strongList[0:-2] + ")."
a68f32350196 Uploaded
cpt
parents:
diff changeset
106 if "Notes" not in x[0].qualifiers.keys():
a68f32350196 Uploaded
cpt
parents:
diff changeset
107 x[0].qualifiers["Notes"] = []
a68f32350196 Uploaded
cpt
parents:
diff changeset
108 x[0].qualifiers["Notes"].append(finList)
a68f32350196 Uploaded
cpt
parents:
diff changeset
109 x[1] = candidate
a68f32350196 Uploaded
cpt
parents:
diff changeset
110 elif len(x[1]) == 1:
a68f32350196 Uploaded
cpt
parents:
diff changeset
111 x[1] = x[1][0]
a68f32350196 Uploaded
cpt
parents:
diff changeset
112 if "Notes" not in x[0].qualifiers.keys():
a68f32350196 Uploaded
cpt
parents:
diff changeset
113 x[0].qualifiers["Notes"] = []
a68f32350196 Uploaded
cpt
parents:
diff changeset
114 finList = str(x[0].type) + " had no locus tag set in .gbk file, automatically derived."
a68f32350196 Uploaded
cpt
parents:
diff changeset
115 x[0].qualifiers["Notes"].append(finList)
a68f32350196 Uploaded
cpt
parents:
diff changeset
116 elif len(x[1]) > 1:
a68f32350196 Uploaded
cpt
parents:
diff changeset
117 candidate = x[1][0] #Arbitrarily choose first one
a68f32350196 Uploaded
cpt
parents:
diff changeset
118 finList = ""
a68f32350196 Uploaded
cpt
parents:
diff changeset
119 for loc in x[1]:
a68f32350196 Uploaded
cpt
parents:
diff changeset
120 if loc != candidate:
a68f32350196 Uploaded
cpt
parents:
diff changeset
121 finList += loc + ", "
a68f32350196 Uploaded
cpt
parents:
diff changeset
122 finList = str(x[0].type) + " had no locus tag set in .gbk file, automatically derived. Other candidates were " + finList[0:-2] + "."
a68f32350196 Uploaded
cpt
parents:
diff changeset
123 if "Notes" not in x[0].qualifiers.keys():
a68f32350196 Uploaded
cpt
parents:
diff changeset
124 x[0].qualifiers["Notes"] = []
a68f32350196 Uploaded
cpt
parents:
diff changeset
125 x[0].qualifiers["Notes"].append(finList)
a68f32350196 Uploaded
cpt
parents:
diff changeset
126 x[1] = candidate
a68f32350196 Uploaded
cpt
parents:
diff changeset
127 else:
a68f32350196 Uploaded
cpt
parents:
diff changeset
128 if makeGene:
a68f32350196 Uploaded
cpt
parents:
diff changeset
129 sys.stderr.write("Warning: Unable to find potential parent for feature with no " + identifier + " of type " + str(x[0].type) + " at location [" + str(x[0].location.start + 1) + ", " + str(x[0].location.end) + "], creating standalone gene.\n")
a68f32350196 Uploaded
cpt
parents:
diff changeset
130 autoGeneNum += 1
a68f32350196 Uploaded
cpt
parents:
diff changeset
131 x[0].source = "GbkToGff"
a68f32350196 Uploaded
cpt
parents:
diff changeset
132 x[0].score = 0
a68f32350196 Uploaded
cpt
parents:
diff changeset
133 x[0].shift = 0
a68f32350196 Uploaded
cpt
parents:
diff changeset
134 if "ID" not in x[0].qualifiers.keys():
a68f32350196 Uploaded
cpt
parents:
diff changeset
135 x[0].qualifiers["ID"] = [recID + ".standalone_" + x[0].type + "." + str(autoGeneNum)]
a68f32350196 Uploaded
cpt
parents:
diff changeset
136 tempName = recID + ".derived_Gene." + str(autoGeneNum)
a68f32350196 Uploaded
cpt
parents:
diff changeset
137 tempQuals = {"ID" : [tempName], "Notes" : ["Gene feature automatically generated by Gbk to GFF conversion"]}
a68f32350196 Uploaded
cpt
parents:
diff changeset
138 tempGene = gffSeqFeature(FeatureLocation(x[0].location.start, x[0].location.end, x[0].location.strand), 'gene', '', x[0].strand, tempName, tempQuals, None, None, None, ".", 0, "GbkToGff")
a68f32350196 Uploaded
cpt
parents:
diff changeset
139 if makeMRNA:
a68f32350196 Uploaded
cpt
parents:
diff changeset
140 tempName = recID + ".derived_mRNA." + str(autoGeneNum)
a68f32350196 Uploaded
cpt
parents:
diff changeset
141 tempQuals = {"ID" : [tempName], "Notes" : ["mRNA feature automatically generated by Gbk to GFF conversion"]}
a68f32350196 Uploaded
cpt
parents:
diff changeset
142 tempGene.sub_features.append(gffSeqFeature(FeatureLocation(x[0].location.start, x[0].location.end, x[0].location.strand), 'mRNA', '', x[0].strand, tempName, tempQuals, None, None, None, ".", 0, "GbkToGff"))
a68f32350196 Uploaded
cpt
parents:
diff changeset
143 tempGene.sub_features[-1].sub_features.append(x[0])
a68f32350196 Uploaded
cpt
parents:
diff changeset
144 else:
a68f32350196 Uploaded
cpt
parents:
diff changeset
145 tempGene.sub_features.append(x[0])
a68f32350196 Uploaded
cpt
parents:
diff changeset
146
a68f32350196 Uploaded
cpt
parents:
diff changeset
147
a68f32350196 Uploaded
cpt
parents:
diff changeset
148 outFeats.append(tempGene)
a68f32350196 Uploaded
cpt
parents:
diff changeset
149 else:
a68f32350196 Uploaded
cpt
parents:
diff changeset
150 sys.stderr.write("Warning: Unable to find potential parent for feature with no " + identifier + " of type " + str(x[0].type) + " at location [" + str(x[0].location.start + 1) + ", " + str(x[0].location.end) + "].\n")
a68f32350196 Uploaded
cpt
parents:
diff changeset
151 if x[0].type not in topTypeDict.keys():
a68f32350196 Uploaded
cpt
parents:
diff changeset
152 topTypeDict[x[0].type] = 1
a68f32350196 Uploaded
cpt
parents:
diff changeset
153 else:
a68f32350196 Uploaded
cpt
parents:
diff changeset
154 topTypeDict[x[0].type] += 1
a68f32350196 Uploaded
cpt
parents:
diff changeset
155 outFeats.append(makeGffFeat(x[0], topTypeDict[x[0].type], recID, identifier))
a68f32350196 Uploaded
cpt
parents:
diff changeset
156
a68f32350196 Uploaded
cpt
parents:
diff changeset
157 for locus in locBucket.keys():
a68f32350196 Uploaded
cpt
parents:
diff changeset
158 if len(locBucket[locus]) == 1: # No heirarchy to be made
a68f32350196 Uploaded
cpt
parents:
diff changeset
159 outFeats.append(makeGffFeat(locBucket[locus][0], 0, recID, identifier))
a68f32350196 Uploaded
cpt
parents:
diff changeset
160 continue
a68f32350196 Uploaded
cpt
parents:
diff changeset
161 topFeat = None
a68f32350196 Uploaded
cpt
parents:
diff changeset
162 midFeat = None
a68f32350196 Uploaded
cpt
parents:
diff changeset
163 bottomFeats = []
a68f32350196 Uploaded
cpt
parents:
diff changeset
164 typeDict = {}
a68f32350196 Uploaded
cpt
parents:
diff changeset
165 minLoc = locBucket[locus][0].location.start
a68f32350196 Uploaded
cpt
parents:
diff changeset
166 maxLoc = locBucket[locus][0].location.end
a68f32350196 Uploaded
cpt
parents:
diff changeset
167 geneNum += 1
a68f32350196 Uploaded
cpt
parents:
diff changeset
168 for feat in locBucket[locus]:
a68f32350196 Uploaded
cpt
parents:
diff changeset
169 # If we want to make our own top-level feat?
a68f32350196 Uploaded
cpt
parents:
diff changeset
170 minLoc = min(minLoc, feat.location.start)
a68f32350196 Uploaded
cpt
parents:
diff changeset
171 maxLoc = max(maxLoc, feat.location.end)
a68f32350196 Uploaded
cpt
parents:
diff changeset
172
a68f32350196 Uploaded
cpt
parents:
diff changeset
173 # Gene->mRNA->CDS included as example, to add other feature-heirarchys in the appropriate slot
a68f32350196 Uploaded
cpt
parents:
diff changeset
174 if feat.type in ['gene']:
a68f32350196 Uploaded
cpt
parents:
diff changeset
175 if not topFeat:
a68f32350196 Uploaded
cpt
parents:
diff changeset
176 topFeat = feat
a68f32350196 Uploaded
cpt
parents:
diff changeset
177 # Else handle multiple top features
a68f32350196 Uploaded
cpt
parents:
diff changeset
178 elif feat.type in ['mRNA', 'tRNA', 'rRNA']:
a68f32350196 Uploaded
cpt
parents:
diff changeset
179 if not midFeat:
a68f32350196 Uploaded
cpt
parents:
diff changeset
180 midFeat = feat
a68f32350196 Uploaded
cpt
parents:
diff changeset
181 # Else handle multiple mid feats (May need another elif type-in-list statement if we actually expect a list of mid feats)
a68f32350196 Uploaded
cpt
parents:
diff changeset
182 else:
a68f32350196 Uploaded
cpt
parents:
diff changeset
183 if feat.type not in typeDict.keys():
a68f32350196 Uploaded
cpt
parents:
diff changeset
184 typeDict[feat.type] = 1
a68f32350196 Uploaded
cpt
parents:
diff changeset
185 else:
a68f32350196 Uploaded
cpt
parents:
diff changeset
186 typeDict[feat.type] += 1
a68f32350196 Uploaded
cpt
parents:
diff changeset
187 bottomFeats.append(feat)
a68f32350196 Uploaded
cpt
parents:
diff changeset
188
a68f32350196 Uploaded
cpt
parents:
diff changeset
189 for x in seekingParent:
a68f32350196 Uploaded
cpt
parents:
diff changeset
190 if type(x[1]) != "list" and locus == x[1]:
a68f32350196 Uploaded
cpt
parents:
diff changeset
191 x[0].qualifiers[identifier] = [locus]
a68f32350196 Uploaded
cpt
parents:
diff changeset
192 bottomFeats.append(x[0])
a68f32350196 Uploaded
cpt
parents:
diff changeset
193 if x[0].type not in typeDict.keys():
a68f32350196 Uploaded
cpt
parents:
diff changeset
194 typeDict[x[0].type] = 1
a68f32350196 Uploaded
cpt
parents:
diff changeset
195 else:
a68f32350196 Uploaded
cpt
parents:
diff changeset
196 typeDict[x[0].type] += 1
a68f32350196 Uploaded
cpt
parents:
diff changeset
197
a68f32350196 Uploaded
cpt
parents:
diff changeset
198
a68f32350196 Uploaded
cpt
parents:
diff changeset
199
a68f32350196 Uploaded
cpt
parents:
diff changeset
200
a68f32350196 Uploaded
cpt
parents:
diff changeset
201
a68f32350196 Uploaded
cpt
parents:
diff changeset
202 #if not topFeat: # Make our own top-level feature based off minLoc, maxLoc bounds
a68f32350196 Uploaded
cpt
parents:
diff changeset
203
a68f32350196 Uploaded
cpt
parents:
diff changeset
204 for x in typeDict.keys(): # If only 1, set it to 0 so we don't append a number to the name
a68f32350196 Uploaded
cpt
parents:
diff changeset
205 if typeDict[x] == 1: # Else, set to 1 so that we count up as we encounter the features
a68f32350196 Uploaded
cpt
parents:
diff changeset
206 typeDict[x] = 0
a68f32350196 Uploaded
cpt
parents:
diff changeset
207 else:
a68f32350196 Uploaded
cpt
parents:
diff changeset
208 typeDict[x] = 1
a68f32350196 Uploaded
cpt
parents:
diff changeset
209
a68f32350196 Uploaded
cpt
parents:
diff changeset
210 if not topFeat:
a68f32350196 Uploaded
cpt
parents:
diff changeset
211 if makeGene:
a68f32350196 Uploaded
cpt
parents:
diff changeset
212 if midFeat:
a68f32350196 Uploaded
cpt
parents:
diff changeset
213 possibleStrand = midFeat.strand
a68f32350196 Uploaded
cpt
parents:
diff changeset
214 else:
a68f32350196 Uploaded
cpt
parents:
diff changeset
215 possibleStrand = bottomFeats[0].strand
a68f32350196 Uploaded
cpt
parents:
diff changeset
216 tempName = recID + ".gene." + str(geneNum)
a68f32350196 Uploaded
cpt
parents:
diff changeset
217 tempQuals = {identifier : [locus], "ID" : [tempName], "Notes" : ["Gene feature automatically generated by Gbk to GFF conversion"]}
a68f32350196 Uploaded
cpt
parents:
diff changeset
218 topFeat = gffSeqFeature(FeatureLocation(minLoc, maxLoc, possibleStrand), 'gene', '', possibleStrand, tempName, tempQuals, None, None, None, ".", 0, "GbkToGff")
a68f32350196 Uploaded
cpt
parents:
diff changeset
219 else:
a68f32350196 Uploaded
cpt
parents:
diff changeset
220 sys.stderr.write("Unable to create a feature heirarchy at location [%d, %d] with features: \n" % (minLoc, maxLoc))
a68f32350196 Uploaded
cpt
parents:
diff changeset
221 for x in locBucket[locus]:
a68f32350196 Uploaded
cpt
parents:
diff changeset
222 sys.stderr.write(str(x))
a68f32350196 Uploaded
cpt
parents:
diff changeset
223 sys.stderr.write('\n')
a68f32350196 Uploaded
cpt
parents:
diff changeset
224 failed = 1
a68f32350196 Uploaded
cpt
parents:
diff changeset
225 continue
a68f32350196 Uploaded
cpt
parents:
diff changeset
226
a68f32350196 Uploaded
cpt
parents:
diff changeset
227 outFeats.append(makeGffFeat(topFeat, 0, recID, identifier))
a68f32350196 Uploaded
cpt
parents:
diff changeset
228 if not midFeat and topFeat.type == "gene" and makeMRNA:
a68f32350196 Uploaded
cpt
parents:
diff changeset
229 if identifier in topFeat.qualifiers.keys():
a68f32350196 Uploaded
cpt
parents:
diff changeset
230 tempName = topFeat.qualifiers[identifier][0] + ".mRNA"
a68f32350196 Uploaded
cpt
parents:
diff changeset
231 tempQuals = {identifier : topFeat.qualifiers[identifier], "ID" : [tempName], "Notes" : ["mRNA feature automatically generated by Gbk to GFF conversion"]}
a68f32350196 Uploaded
cpt
parents:
diff changeset
232 else:
a68f32350196 Uploaded
cpt
parents:
diff changeset
233 tempName = outFeats[-1].ID + ".mRNA"
a68f32350196 Uploaded
cpt
parents:
diff changeset
234 tempQuals = {identifier : topFeat.qualifiers[identifier], "ID" : [tempName], "Notes" : ["mRNA feature automatically generated by Gbk to GFF conversion"]}
a68f32350196 Uploaded
cpt
parents:
diff changeset
235 midFeat = gffSeqFeature(FeatureLocation(minLoc, maxLoc, topFeat.strand), 'mRNA', '', topFeat.strand, tempName, tempQuals, None, None, None, ".", 0, "GbkToGff")
a68f32350196 Uploaded
cpt
parents:
diff changeset
236
a68f32350196 Uploaded
cpt
parents:
diff changeset
237 if midFeat: # Again, need a new if statement if we want to handle multiple mid-tier features
a68f32350196 Uploaded
cpt
parents:
diff changeset
238 outFeats[-1].sub_features.append(makeGffFeat(midFeat, 0, recID, identifier))
a68f32350196 Uploaded
cpt
parents:
diff changeset
239 outFeats[-1].sub_features[-1].qualifiers["Parent"] = [outFeats[-1].id]
a68f32350196 Uploaded
cpt
parents:
diff changeset
240 for x in bottomFeats:
a68f32350196 Uploaded
cpt
parents:
diff changeset
241 typeDict[x.type] += 1
a68f32350196 Uploaded
cpt
parents:
diff changeset
242 outFeats[-1].sub_features[-1].sub_features.append(makeGffFeat(x, typeDict[x.type], recID, identifier))
a68f32350196 Uploaded
cpt
parents:
diff changeset
243 outFeats[-1].sub_features[-1].sub_features[-1].qualifiers["Parent"] = [outFeats[-1].sub_features[-1].id]
a68f32350196 Uploaded
cpt
parents:
diff changeset
244 else: # No midFeat, append bottom feats directly to top feats
a68f32350196 Uploaded
cpt
parents:
diff changeset
245 for x in bottomFeats:
a68f32350196 Uploaded
cpt
parents:
diff changeset
246 typeDict[x.type] += 1
a68f32350196 Uploaded
cpt
parents:
diff changeset
247 outFeats[-1].sub_features.append(makeGffFeat(x, typeDict[x.type], recID, identifier))
a68f32350196 Uploaded
cpt
parents:
diff changeset
248 outFeats[-1].sub_features[-1].qualifiers["Parent"] = [outFeats[-1].id]
a68f32350196 Uploaded
cpt
parents:
diff changeset
249
a68f32350196 Uploaded
cpt
parents:
diff changeset
250 outRec.append(SeqRecord(rec.seq, recID, rec.name, rec.description, rec.dbxrefs, sorted(outFeats, key=lambda x: x.location.start), rec.annotations, rec.letter_annotations))
a68f32350196 Uploaded
cpt
parents:
diff changeset
251 SeqIO.write([outRec[-1]], fastaFile, "fasta")
a68f32350196 Uploaded
cpt
parents:
diff changeset
252 gffWrite(outRec, ofh)
a68f32350196 Uploaded
cpt
parents:
diff changeset
253 exit(failed) # 0 if all features handled, 1 if unable to handle some
a68f32350196 Uploaded
cpt
parents:
diff changeset
254
a68f32350196 Uploaded
cpt
parents:
diff changeset
255
a68f32350196 Uploaded
cpt
parents:
diff changeset
256 if __name__ == '__main__':
a68f32350196 Uploaded
cpt
parents:
diff changeset
257 parser = argparse.ArgumentParser( description='Biopython solution to Gbk to GFF conversion')
a68f32350196 Uploaded
cpt
parents:
diff changeset
258
a68f32350196 Uploaded
cpt
parents:
diff changeset
259 parser.add_argument('inFile', type=argparse.FileType("r"), help='Path to an input GBK file' )
a68f32350196 Uploaded
cpt
parents:
diff changeset
260 parser.add_argument('--makeMRNA', action="store_true", required=False, help="Automatically create mRNA features")
a68f32350196 Uploaded
cpt
parents:
diff changeset
261 parser.add_argument('--makeGene', action="store_true", required=False, help="Automatically create missing Gene features")
a68f32350196 Uploaded
cpt
parents:
diff changeset
262 parser.add_argument('--identifier', type=str, default="locus_tag", required=False, help="Qualifier to derive ID property from")
a68f32350196 Uploaded
cpt
parents:
diff changeset
263 parser.add_argument('--fastaFile', type=argparse.FileType("w"), help='Fasta output for sequences' )
a68f32350196 Uploaded
cpt
parents:
diff changeset
264 parser.add_argument('--outFile', type=argparse.FileType("w"), help='GFF feature output' )
a68f32350196 Uploaded
cpt
parents:
diff changeset
265 args = parser.parse_args()
a68f32350196 Uploaded
cpt
parents:
diff changeset
266 main(**vars(args))
a68f32350196 Uploaded
cpt
parents:
diff changeset
267
a68f32350196 Uploaded
cpt
parents:
diff changeset
268
a68f32350196 Uploaded
cpt
parents:
diff changeset
269
a68f32350196 Uploaded
cpt
parents:
diff changeset
270
a68f32350196 Uploaded
cpt
parents:
diff changeset
271
a68f32350196 Uploaded
cpt
parents:
diff changeset
272
a68f32350196 Uploaded
cpt
parents:
diff changeset
273
a68f32350196 Uploaded
cpt
parents:
diff changeset
274