3
|
1 #!/usr/bin/env python3.4
|
|
2 # Author: Jasper Jan Koehorst
|
|
3 # Date created: Feb 21 2015
|
|
4 # Function: generation of a RDF file from Genbank/EMBL
|
|
5
|
|
6 import warnings
|
|
7 warnings.filterwarnings("ignore")
|
|
8
|
|
9 def delete_galaxy():
|
|
10 import sys
|
|
11 for index, path in enumerate(sys.path):
|
|
12 if "galaxy-dist/" in path:
|
|
13 sys.path[index] = ''
|
|
14
|
|
15 #Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function. This is not an elegant solution but it works for now.
|
|
16 delete_galaxy()
|
|
17
|
|
18 from Bio import SeqIO
|
|
19 # Import RDFLib's default Graph implementation.
|
|
20 import os, sys
|
|
21 from Bio.Seq import Seq
|
|
22
|
|
23 from rdflib import Graph, URIRef, Literal,Namespace,RDF,RDFS,OWL, plugin
|
|
24 from rdflib.store import Store
|
|
25 import hashlib
|
|
26 store = plugin.get('IOMemory', Store)()
|
|
27
|
|
28 global URI
|
|
29 URI = "http://csb.wur.nl/genome/"
|
|
30 global seeAlso
|
|
31 seeAlso = "rdfs:seeAlso"
|
|
32 global coreURI
|
|
33 coreURI = Namespace(URI)
|
|
34
|
|
35 global SubClassOfDict
|
|
36 SubClassOfDict = {}
|
|
37 global SubClassOfDictRna
|
|
38 SubClassOfDictRna = {}
|
|
39
|
|
40 def createClass(uri, root=True):
|
|
41 genomeGraph.add((uri,RDF.type,OWL.Class))
|
|
42 if root:
|
|
43 genomeGraph.add((uri,RDFS.subClassOf,OWL.Thing))
|
|
44 return uri
|
|
45
|
|
46 def tmp():
|
|
47 import time
|
|
48 global tmpFolder
|
|
49 tmpFolder = "/tmp/"+str(time.time())+"/"
|
|
50 os.mkdir(tmpFolder)
|
|
51
|
|
52 def cleantmp():
|
|
53 os.system("ls "+tmpFolder)
|
|
54 os.system("rm -rf "+tmpFolder)
|
|
55
|
|
56 def crawler():
|
|
57 #From input folder it looks for GBK file (gz files are in progress)
|
|
58 input_file = sys.argv[sys.argv.index("-input")+1]
|
|
59 gbk_parser(input_file)
|
|
60
|
|
61 def gbk_parser():
|
|
62 prevObjStart = -1
|
|
63 prevObjStop = -1
|
|
64 store = plugin.get('IOMemory', Store)()
|
|
65 global genomeGraph
|
|
66 genomeGraph = Graph(store,URIRef(URI))
|
|
67 genomeGraph.bind("ssb",coreURI)
|
|
68 input_file = sys.argv[sys.argv.index("-input")+1]
|
|
69
|
|
70 #CLASS definitions
|
|
71 genomeClass = createClass(coreURI["Genome"], root=True)
|
|
72 typeClass = createClass(coreURI["DnaObject"], root=True)
|
|
73 createClass(coreURI["Protein"], root=True)
|
|
74 pubmedClass = createClass(coreURI["Pubmed"], root=True)
|
|
75 miscClass = createClass(coreURI["MiscFeature"], root=False)
|
|
76 createClass(coreURI["Feature"], root=True)
|
|
77 SubClassOfDict["MiscFeature"] = 1
|
|
78 SubClassOfDictRna["Trna"] = 1
|
|
79 SubClassOfDictRna["Rrna"] = 1
|
|
80 SubClassOfDictRna["Tmrna"] = 1
|
|
81 SubClassOfDictRna["Ncrna"] = 1
|
|
82
|
|
83 # codon = "11" #Default initialization if no CDS are present
|
|
84 ##################
|
|
85 weird_chars = list(''',./?<>:;"'|\}]{[+=_-)(*&^%$#@!±§~` ''')
|
|
86 scaf_value = 0
|
|
87 #Which files are already done
|
|
88 ########
|
|
89 formatGBK = sys.argv[sys.argv.index("-format")+1]
|
|
90 for record in SeqIO.parse(input_file, formatGBK):
|
|
91 #Read first feature for genome name and information...
|
|
92 #Ignore the empty GBK file due to the lack of features?
|
|
93
|
|
94 for index, feature in enumerate(record.features):
|
|
95 if index == 0:
|
|
96 if "-identifier" in sys.argv:
|
|
97 genome = sys.argv[sys.argv.index("-identifier")+1]
|
|
98 else:
|
|
99 try:
|
|
100 genome = feature.qualifiers["organism"][0].replace(" ","_")
|
|
101 except:
|
|
102 #BUG: THIS IS A TEMP FIX, USE GALAXY -IDENTIFIER TO CAPTURE THIS
|
|
103 genome = "XNoneX"
|
|
104 for char in weird_chars:
|
|
105 genome = genome.replace(char,"_")
|
|
106
|
|
107 try:
|
|
108 gi = record.annotations["gi"]
|
|
109 typ = str(gi)
|
|
110 except:
|
6
|
111 try:
|
|
112 gi = record.annotations["accessions"][0]
|
|
113 typ = str(gi)
|
|
114 except:
|
|
115 scaf_value += 1
|
|
116 typ = "scaffold_"+str(scaf_value)
|
3
|
117 genomeURI = coreURI[genome]
|
|
118 gbkURI = coreURI[genome + "/" + typ]
|
|
119 #To contig connection to connect all data to it
|
|
120 genomeGraph.add((genomeURI, coreURI["dnaobject"] , gbkURI))
|
|
121
|
|
122 #General genome features also stored in the class...
|
|
123 if "genome" in feature.qualifiers:
|
|
124 genomeGraph.add((genomeURI, coreURI["organism"],Literal(feature.qualifiers["organism"][0])))
|
|
125 if "strain" in feature.qualifiers:
|
|
126 genomeGraph.add((genomeURI, coreURI["strain"],Literal(feature.qualifiers["strain"][0])))
|
|
127 if "taxonomy" in record.annotations:
|
|
128 for taxon in record.annotations["taxonomy"]:
|
|
129 genomeGraph.add((genomeURI, coreURI["taxonomy"],Literal(taxon)))
|
|
130 record.annotations["taxonomy"] = []
|
|
131 #Genome sequence#
|
|
132 sequence = str(record.seq)
|
|
133 #Verify if sequence was not empty and is now full of X or N
|
|
134 filtered_sequence = sequence.replace("X","").replace("N","")
|
|
135 if len(filtered_sequence) == 0:
|
|
136 sequence = ""
|
|
137 #Record parsing#
|
|
138 for annot in record.annotations:
|
|
139 if type(record.annotations[annot]) == list:
|
|
140 if annot == "references":
|
|
141 for references in record.annotations[annot]:
|
|
142 if references.pubmed_id != "":
|
|
143 pubmed = references.pubmed_id
|
|
144 genomeGraph.add((gbkURI, coreURI[annot.lower()] , coreURI["pubmed/"+pubmed]))
|
|
145 obj_dict = references.__dict__
|
|
146 for key in obj_dict:
|
|
147 genomeGraph.add((coreURI["pubmed/"+pubmed], coreURI[key.lower()], Literal(str(obj_dict[key]))))
|
|
148 genomeGraph.add((coreURI["pubmed/"+pubmed], RDF.type, pubmedClass))
|
|
149
|
|
150 else:
|
|
151 for a in record.annotations[annot]:
|
|
152 int_add(gbkURI,coreURI[annot.lower()],str(a))
|
|
153 else:
|
|
154 int_add(gbkURI,coreURI[annot.lower()],str(record.annotations[annot]))
|
6
|
155
|
3
|
156 #####END of RECORD####
|
|
157 if len(sequence) > 0:
|
|
158 genomeGraph.add((gbkURI, coreURI["sequence"] , Literal(sequence)))
|
|
159 genomeGraph.add((genomeURI, RDF.type,genomeClass))
|
|
160 genomeGraph.add((gbkURI, RDF.type,typeClass))
|
|
161 for key in feature.qualifiers:
|
|
162 genomeGraph.add((gbkURI, coreURI[key.lower()] , Literal(feature.qualifiers[key][0])))
|
|
163 #break
|
|
164 else: #The rest of the GBK file
|
|
165 feature_type = feature.type
|
|
166 end = str(feature.location.end).replace(">","").replace("<","")
|
|
167 start = str(feature.location.start).replace(">","").replace("<","")
|
|
168
|
|
169 strand = str(feature.location.strand)
|
|
170
|
|
171 if strand == 'None':
|
|
172 strand = 0
|
|
173 else:
|
|
174 if feature.type == "misc_feature": #Store as part of previous cds or something...
|
|
175 if strand == "-1":
|
|
176 miscURI = coreURI[genome + "/" + typ + "/"+feature_type+"/gbk/"+str(end)+"_"+str(start)]
|
|
177 else:
|
|
178 miscURI = coreURI[genome + "/" + typ + "/"+feature_type+"/gbk/"+str(start)+"_"+str(end)]
|
|
179
|
|
180 # TODO: Check if biopython has an overlap function...
|
|
181 if int(prevObjStart) <= int(start):
|
|
182 if int(end) <= int(prevObjStop):
|
|
183 pass
|
|
184 # genomeGraph.add((typeURI,coreURI["feature"],miscURI))
|
|
185 # genomeGraph.add((miscURI,RDF.type,miscClass))
|
|
186 else:
|
|
187 genomeGraph.add((gbkURI, coreURI["feature"] , miscURI))
|
|
188 genomeGraph.add((miscURI,RDF.type,miscClass))
|
|
189 else:
|
|
190 genomeGraph.add((gbkURI, coreURI["feature"] , miscURI))
|
|
191 genomeGraph.add((miscURI,RDF.type,miscClass))
|
|
192
|
|
193 store_general_information(miscURI,feature,record)
|
|
194 else:
|
|
195 prevObjStart = start
|
|
196 prevObjStop = end
|
|
197
|
|
198 if strand == "-1":
|
|
199 typeURI = coreURI[genome + "/" + typ + "/" + feature_type+"/gbk/"+str(end)+"_"+str(start)]
|
|
200 else:
|
|
201 typeURI = coreURI[genome + "/" + typ + "/" + feature_type+"/gbk/"+str(start)+"_"+str(end)]
|
|
202
|
6
|
203 #Contig specific connection
|
3
|
204 genomeGraph.add((gbkURI, coreURI["feature"] , typeURI))
|
|
205 ############################
|
|
206
|
|
207 store_general_information(typeURI,feature,record)
|
|
208
|
|
209 for subfeature in feature.sub_features:
|
|
210 strand = str(subfeature.location.strand)
|
|
211 subfeature_type = subfeature.type
|
|
212 end = str(subfeature.location.end).replace(">","").replace("<","")
|
|
213 start = str(subfeature.location.start).replace(">","").replace("<","")
|
|
214
|
|
215 if strand == "-1":
|
|
216 subURI = coreURI[genome + "/" + typ + "/" + subfeature_type+"/gbk/"+str(end)+"_"+str(start)]
|
|
217 else:
|
|
218 subURI = coreURI[genome + "/" + typ + "/" + subfeature_type+"/gbk/"+str(start)+"_"+str(end)]
|
|
219 genomeGraph.add((typeURI, coreURI["feature"] , subURI))
|
|
220 store_general_information(subURI,subfeature,record,feature)
|
|
221
|
6
|
222
|
3
|
223 def store_general_information(generalURI,feature,record,superfeature=""):
|
|
224 proteinClass = createClass(coreURI["Protein"], root=True)
|
|
225 sequence = str(record.seq)
|
|
226 cds_sequence = str(feature.extract(sequence))
|
|
227 #Fixes the 0 count instead of 1-count in biopython vs humans
|
|
228 feature_type = feature.type
|
|
229 end = str(feature.location.end).replace(">","").replace("<","")
|
|
230 start = str(feature.location.start).replace(">","").replace("<","")
|
|
231 strand = str(feature.location.strand)
|
|
232 if strand == "None":
|
|
233 strand = 0
|
|
234
|
|
235 genomeGraph.add((generalURI,coreURI["sourcedb"],Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
|
|
236
|
|
237 if strand == "-1":
|
|
238 genomeGraph.add((generalURI,coreURI["end"],Literal(int(start)+1)))
|
|
239 genomeGraph.add((generalURI,coreURI["begin"],Literal(int(end))))
|
|
240 else:
|
|
241 genomeGraph.add((generalURI,coreURI["begin"],Literal(int(start)+1)))
|
|
242 genomeGraph.add((generalURI,coreURI["end"],Literal(int(end))))
|
|
243 genomeGraph.add((generalURI,coreURI["strand"],Literal(int(strand))))
|
|
244 if feature.type != "misc_feature":
|
|
245 try:
|
|
246 genomeGraph.add((generalURI,coreURI["sequence"],Literal(cds_sequence)))
|
|
247 except: #When protein sequence is not given for whatever reason
|
|
248 print ("wrong?")
|
|
249
|
|
250 if feature.type == "misc_feature":
|
|
251 pass
|
|
252 else:
|
|
253 genomeGraph.add((generalURI,RDF.type,createClass(coreURI[feature_type.lower().title()], root=False)))
|
|
254 if feature_type.lower() != "rrna" and feature_type.lower() != "trna" and feature_type.lower() != "tmrna" and feature_type.lower() != "ncrna":
|
|
255 SubClassOfDict[feature_type.lower().title()] = 1
|
|
256 for key in feature.qualifiers:
|
|
257 values = feature.qualifiers[key]
|
|
258 if key == "translation":
|
|
259 pass
|
|
260 elif type(values) == list:
|
|
261 for v in values:
|
|
262 int_add(generalURI,coreURI[key.lower()],v)
|
|
263 else:
|
|
264 int_add(generalURI,coreURI[key.lower()],values)
|
|
265 if feature.type == "CDS":
|
|
266 try:
|
|
267 #Feature is normally submitted to this function
|
|
268 #IF a subfeature is submitted it is submitted as a feature
|
|
269 #And subfeature variable will contain the superfeature
|
|
270 if superfeature:
|
|
271 codon = superfeature.qualifiers["transl_table"][0]
|
|
272 except:
|
|
273 #Default codon table 11
|
|
274 codon = "11"
|
|
275 #Protein linkage
|
|
276 translation = ""
|
|
277 try:
|
|
278 translation = feature.qualifiers["translation"][0].strip("*")
|
|
279 except KeyError:
|
|
280 #When protein sequence is not given...
|
|
281 if len(feature.location.parts) > 1:
|
|
282 #Exon boundaries?
|
|
283 seq = ''
|
|
284 for loc in feature.location:
|
|
285 seq += record.seq[loc]
|
|
286 if int(feature.location.strand) == -1:
|
|
287 seq = Seq(seq).complement()
|
|
288 else:
|
|
289 seq = Seq(seq)
|
|
290 translation = str(seq.translate(feature.qualifiers["transl_table"][0]))
|
|
291 elif int(feature.location.strand) == -1:
|
|
292 if str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].reverse_complement().translate(codon)).strip("*") != translation:
|
|
293 if len(str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end])) % 3 == 0:
|
|
294 translation = str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].reverse_complement().translate(codon))
|
|
295 else:
|
|
296 translation = ''
|
|
297 elif int(feature.location.strand) == +1:
|
|
298 if len(str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end])) % 3 == 0:
|
|
299 translation = str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].translate(codon))
|
|
300 else:
|
|
301 translation = ''
|
|
302
|
|
303 if translation:
|
|
304 translation = list(translation)
|
|
305 translation[0] = "M"
|
|
306 translation = ''.join(translation).strip("*")
|
|
307 if "*" in translation:
|
|
308 pass
|
|
309
|
|
310 translation = translation.encode('utf-8')
|
|
311 md5_protein = hashlib.md5(translation).hexdigest()
|
|
312 proteinURI = coreURI["protein/"+md5_protein]
|
|
313 genomeGraph.add((generalURI,coreURI["protein"],proteinURI))
|
|
314 for key in feature.qualifiers:
|
|
315 for v in feature.qualifiers[key]:
|
|
316 if key == "translation":
|
|
317 genomeGraph.add((proteinURI,coreURI["md5"],Literal(md5_protein)))
|
|
318 genomeGraph.add((proteinURI,coreURI["sequence"],Literal(translation)))
|
|
319 genomeGraph.add((proteinURI,RDF.type,proteinClass))
|
|
320 else:
|
|
321 for v in feature.qualifiers[key]:
|
|
322 int_add(generalURI,coreURI[key.lower()],v)
|
|
323
|
|
324 def int_add(subject, predicate, obj):
|
|
325 try:
|
|
326 object_float = float(obj.replace('"',''))
|
|
327 object_int = int(obj.replace('"',''))
|
|
328 if object_int == object_float:
|
|
329 genomeGraph.add((subject,predicate,Literal(object_int)))
|
|
330 else:
|
|
331 genomeGraph.add((subject,predicate,Literal(object_float)))
|
|
332 except:
|
|
333 genomeGraph.add((subject,predicate,Literal(obj.replace('"',''))))
|
|
334
|
|
335 def save():
|
|
336 data = genomeGraph.serialize(format='turtle')
|
|
337 open(sys.argv[sys.argv.index("-output")+1],"wb").write(data)
|
|
338
|
|
339 def subClassOfBuilder():
|
|
340 for subclass in SubClassOfDict:
|
|
341 genomeGraph.add((coreURI["Feature"],RDFS.subClassOf,OWL.Thing))
|
|
342 genomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Feature"]))
|
|
343
|
|
344 def subClassOfBuilderRna():
|
|
345 for subclass in SubClassOfDictRna:
|
|
346 genomeGraph.add((coreURI["Feature"],RDFS.subClassOf,OWL.Thing))
|
|
347 genomeGraph.add((coreURI["Rna"],RDFS.subClassOf,coreURI["Feature"]))
|
|
348 genomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Rna"]))
|
|
349 genomeGraph.add((coreURI[subclass],RDF.type,OWL.Class))
|
|
350
|
|
351 def main():
|
|
352 tmp()
|
|
353 gbk_parser()
|
|
354 subClassOfBuilder()
|
|
355 subClassOfBuilderRna()
|
|
356 save()
|
|
357 cleantmp()
|
|
358
|
|
359 if __name__ == "__main__":
|
|
360 main() |