3
|
1 #!/usr/bin/env python3.4
|
|
2 # Author: Jasper Jan Koehorst
|
|
3 # Date created: Feb 21 2015
|
|
4 # Function: generation of a RDF file from Genbank/EMBL
|
|
5
|
|
6 import warnings
|
|
7 warnings.filterwarnings("ignore")
|
|
8
|
|
9 def delete_galaxy():
|
|
10 import sys
|
|
11 for index, path in enumerate(sys.path):
|
|
12 if "galaxy-dist/" in path:
|
|
13 sys.path[index] = ''
|
|
14
|
|
15 #Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function. This is not an elegant solution but it works for now.
|
|
16 delete_galaxy()
|
|
17
|
|
18 from Bio import SeqIO
|
|
19 # Import RDFLib's default Graph implementation.
|
|
20 import os, sys
|
|
21 from Bio.Seq import Seq
|
|
22
|
|
23 from rdflib import Graph, URIRef, Literal,Namespace,RDF,RDFS,OWL, plugin
|
|
24 from rdflib.store import Store
|
|
25 import hashlib
|
|
26 store = plugin.get('IOMemory', Store)()
|
|
27
|
|
28 global URI
|
|
29 URI = "http://csb.wur.nl/genome/"
|
|
30 global seeAlso
|
|
31 seeAlso = "rdfs:seeAlso"
|
|
32 global coreURI
|
|
33 coreURI = Namespace(URI)
|
|
34
|
|
35 global SubClassOfDict
|
|
36 SubClassOfDict = {}
|
|
37 global SubClassOfDictRna
|
|
38 SubClassOfDictRna = {}
|
|
39
|
|
40 def createClass(uri, root=True):
|
|
41 genomeGraph.add((uri,RDF.type,OWL.Class))
|
|
42 if root:
|
|
43 genomeGraph.add((uri,RDFS.subClassOf,OWL.Thing))
|
|
44 return uri
|
|
45
|
|
46 def tmp():
|
|
47 import time
|
|
48 global tmpFolder
|
|
49 tmpFolder = "/tmp/"+str(time.time())+"/"
|
|
50 os.mkdir(tmpFolder)
|
|
51
|
|
52 def cleantmp():
|
|
53 os.system("ls "+tmpFolder)
|
|
54 os.system("rm -rf "+tmpFolder)
|
|
55
|
|
56 def crawler():
|
|
57 #From input folder it looks for GBK file (gz files are in progress)
|
|
58 input_file = sys.argv[sys.argv.index("-input")+1]
|
|
59 gbk_parser(input_file)
|
|
60
|
|
61 def gbk_parser():
|
|
62 prevObjStart = -1
|
|
63 prevObjStop = -1
|
|
64 store = plugin.get('IOMemory', Store)()
|
|
65 global genomeGraph
|
|
66 genomeGraph = Graph(store,URIRef(URI))
|
|
67 genomeGraph.bind("ssb",coreURI)
|
|
68 input_file = sys.argv[sys.argv.index("-input")+1]
|
|
69
|
|
70 #CLASS definitions
|
|
71 genomeClass = createClass(coreURI["Genome"], root=True)
|
|
72 typeClass = createClass(coreURI["DnaObject"], root=True)
|
|
73 createClass(coreURI["Protein"], root=True)
|
|
74 pubmedClass = createClass(coreURI["Pubmed"], root=True)
|
|
75 miscClass = createClass(coreURI["MiscFeature"], root=False)
|
|
76 createClass(coreURI["Feature"], root=True)
|
|
77 SubClassOfDict["MiscFeature"] = 1
|
|
78 SubClassOfDictRna["Trna"] = 1
|
|
79 SubClassOfDictRna["Rrna"] = 1
|
|
80 SubClassOfDictRna["Tmrna"] = 1
|
|
81 SubClassOfDictRna["Ncrna"] = 1
|
|
82
|
|
83 # codon = "11" #Default initialization if no CDS are present
|
|
84 ##################
|
|
85 weird_chars = list(''',./?<>:;"'|\}]{[+=_-)(*&^%$#@!±§~` ''')
|
|
86 scaf_value = 0
|
|
87 #Which files are already done
|
|
88 ########
|
|
89 formatGBK = sys.argv[sys.argv.index("-format")+1]
|
|
90 for record in SeqIO.parse(input_file, formatGBK):
|
|
91 #Read first feature for genome name and information...
|
|
92 #Ignore the empty GBK file due to the lack of features?
|
|
93
|
|
94 for index, feature in enumerate(record.features):
|
|
95 if index == 0:
|
|
96 if "-identifier" in sys.argv:
|
|
97 genome = sys.argv[sys.argv.index("-identifier")+1]
|
|
98 else:
|
|
99 try:
|
|
100 genome = feature.qualifiers["organism"][0].replace(" ","_")
|
|
101 except:
|
|
102 #BUG: THIS IS A TEMP FIX, USE GALAXY -IDENTIFIER TO CAPTURE THIS
|
|
103 genome = "XNoneX"
|
|
104 for char in weird_chars:
|
|
105 genome = genome.replace(char,"_")
|
|
106
|
|
107 try:
|
|
108 gi = record.annotations["gi"]
|
|
109 typ = str(gi)
|
|
110 except:
|
|
111 scaf_value += 1
|
|
112 typ = "scaffold_"+str(scaf_value)
|
|
113 genomeURI = coreURI[genome]
|
|
114 gbkURI = coreURI[genome + "/" + typ]
|
|
115 #To contig connection to connect all data to it
|
|
116 genomeGraph.add((genomeURI, coreURI["dnaobject"] , gbkURI))
|
|
117
|
|
118 #General genome features also stored in the class...
|
|
119 if "genome" in feature.qualifiers:
|
|
120 genomeGraph.add((genomeURI, coreURI["organism"],Literal(feature.qualifiers["organism"][0])))
|
|
121 if "strain" in feature.qualifiers:
|
|
122 genomeGraph.add((genomeURI, coreURI["strain"],Literal(feature.qualifiers["strain"][0])))
|
|
123 if "taxonomy" in record.annotations:
|
|
124 for taxon in record.annotations["taxonomy"]:
|
|
125 genomeGraph.add((genomeURI, coreURI["taxonomy"],Literal(taxon)))
|
|
126 record.annotations["taxonomy"] = []
|
|
127 #Genome sequence#
|
|
128 sequence = str(record.seq)
|
|
129 #Verify if sequence was not empty and is now full of X or N
|
|
130 filtered_sequence = sequence.replace("X","").replace("N","")
|
|
131 if len(filtered_sequence) == 0:
|
|
132 sequence = ""
|
|
133 #Record parsing#
|
|
134 for annot in record.annotations:
|
|
135 if type(record.annotations[annot]) == list:
|
|
136 if annot == "references":
|
|
137 for references in record.annotations[annot]:
|
|
138 if references.pubmed_id != "":
|
|
139 pubmed = references.pubmed_id
|
|
140 genomeGraph.add((gbkURI, coreURI[annot.lower()] , coreURI["pubmed/"+pubmed]))
|
|
141 obj_dict = references.__dict__
|
|
142 for key in obj_dict:
|
|
143 genomeGraph.add((coreURI["pubmed/"+pubmed], coreURI[key.lower()], Literal(str(obj_dict[key]))))
|
|
144 genomeGraph.add((coreURI["pubmed/"+pubmed], RDF.type, pubmedClass))
|
|
145
|
|
146 else:
|
|
147 for a in record.annotations[annot]:
|
|
148 int_add(gbkURI,coreURI[annot.lower()],str(a))
|
|
149 else:
|
|
150 int_add(gbkURI,coreURI[annot.lower()],str(record.annotations[annot]))
|
|
151
|
|
152
|
|
153 #####END of RECORD####
|
|
154 if len(sequence) > 0:
|
|
155 genomeGraph.add((gbkURI, coreURI["sequence"] , Literal(sequence)))
|
|
156 genomeGraph.add((genomeURI, RDF.type,genomeClass))
|
|
157 genomeGraph.add((gbkURI, RDF.type,typeClass))
|
|
158 for key in feature.qualifiers:
|
|
159 genomeGraph.add((gbkURI, coreURI[key.lower()] , Literal(feature.qualifiers[key][0])))
|
|
160 #break
|
|
161 else: #The rest of the GBK file
|
|
162 feature_type = feature.type
|
|
163 end = str(feature.location.end).replace(">","").replace("<","")
|
|
164 start = str(feature.location.start).replace(">","").replace("<","")
|
|
165
|
|
166 strand = str(feature.location.strand)
|
|
167
|
|
168 if strand == 'None':
|
|
169 strand = 0
|
|
170
|
|
171 # if feature_type == "gene":
|
|
172 # gene = feature
|
|
173 #Store gene in next feature....
|
|
174 # gene_location_start = end = str(gene.location.end).replace(">","").replace("<","")
|
|
175 # gene_location_stop = str(gene.location.start).replace(">","").replace("<","")
|
|
176 # gene_qualifiers = gene.qualifiers
|
|
177 else:
|
|
178 if feature.type == "misc_feature": #Store as part of previous cds or something...
|
|
179 if strand == "-1":
|
|
180 miscURI = coreURI[genome + "/" + typ + "/"+feature_type+"/gbk/"+str(end)+"_"+str(start)]
|
|
181 else:
|
|
182 miscURI = coreURI[genome + "/" + typ + "/"+feature_type+"/gbk/"+str(start)+"_"+str(end)]
|
|
183
|
|
184 # genomeGraph.add((generalURI,coreURI["subFeature"],miscURI))
|
|
185
|
|
186 # TODO: Check if biopython has an overlap function...
|
|
187 if int(prevObjStart) <= int(start):
|
|
188 if int(end) <= int(prevObjStop):
|
|
189 pass
|
|
190 # genomeGraph.add((typeURI,coreURI["feature"],miscURI))
|
|
191 # genomeGraph.add((miscURI,RDF.type,miscClass))
|
|
192 else:
|
|
193 genomeGraph.add((gbkURI, coreURI["feature"] , miscURI))
|
|
194 genomeGraph.add((miscURI,RDF.type,miscClass))
|
|
195 else:
|
|
196 genomeGraph.add((gbkURI, coreURI["feature"] , miscURI))
|
|
197 genomeGraph.add((miscURI,RDF.type,miscClass))
|
|
198
|
|
199 store_general_information(miscURI,feature,record)
|
|
200 else:
|
|
201 prevObjStart = start
|
|
202 prevObjStop = end
|
|
203
|
|
204
|
|
205 if strand == "-1":
|
|
206 typeURI = coreURI[genome + "/" + typ + "/" + feature_type+"/gbk/"+str(end)+"_"+str(start)]
|
|
207 else:
|
|
208 typeURI = coreURI[genome + "/" + typ + "/" + feature_type+"/gbk/"+str(start)+"_"+str(end)]
|
|
209
|
|
210 # cds_sequence = str(feature.extract(sequence))
|
|
211 #Contig specific connection
|
|
212
|
|
213 genomeGraph.add((gbkURI, coreURI["feature"] , typeURI))
|
|
214 ############################
|
|
215
|
|
216 store_general_information(typeURI,feature,record)
|
|
217
|
|
218 for subfeature in feature.sub_features:
|
|
219 strand = str(subfeature.location.strand)
|
|
220 subfeature_type = subfeature.type
|
|
221 end = str(subfeature.location.end).replace(">","").replace("<","")
|
|
222 start = str(subfeature.location.start).replace(">","").replace("<","")
|
|
223
|
|
224 if strand == "-1":
|
|
225 subURI = coreURI[genome + "/" + typ + "/" + subfeature_type+"/gbk/"+str(end)+"_"+str(start)]
|
|
226 else:
|
|
227 subURI = coreURI[genome + "/" + typ + "/" + subfeature_type+"/gbk/"+str(start)+"_"+str(end)]
|
|
228 genomeGraph.add((typeURI, coreURI["feature"] , subURI))
|
|
229 store_general_information(subURI,subfeature,record,feature)
|
|
230
|
|
231 def store_general_information(generalURI,feature,record,superfeature=""):
|
|
232 proteinClass = createClass(coreURI["Protein"], root=True)
|
|
233 sequence = str(record.seq)
|
|
234 cds_sequence = str(feature.extract(sequence))
|
|
235 #Fixes the 0 count instead of 1-count in biopython vs humans
|
|
236 feature_type = feature.type
|
|
237 end = str(feature.location.end).replace(">","").replace("<","")
|
|
238 start = str(feature.location.start).replace(">","").replace("<","")
|
|
239 strand = str(feature.location.strand)
|
|
240 if strand == "None":
|
|
241 strand = 0
|
|
242
|
|
243 genomeGraph.add((generalURI,coreURI["sourcedb"],Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
|
|
244
|
|
245 if strand == "-1":
|
|
246 genomeGraph.add((generalURI,coreURI["end"],Literal(int(start)+1)))
|
|
247 genomeGraph.add((generalURI,coreURI["begin"],Literal(int(end))))
|
|
248 else:
|
|
249 genomeGraph.add((generalURI,coreURI["begin"],Literal(int(start)+1)))
|
|
250 genomeGraph.add((generalURI,coreURI["end"],Literal(int(end))))
|
|
251 genomeGraph.add((generalURI,coreURI["strand"],Literal(int(strand))))
|
|
252 if feature.type != "misc_feature":
|
|
253 try:
|
|
254 genomeGraph.add((generalURI,coreURI["sequence"],Literal(cds_sequence)))
|
|
255 except: #When protein sequence is not given for whatever reason
|
|
256 print ("wrong?")
|
|
257
|
|
258 if feature.type == "misc_feature":
|
|
259 pass
|
|
260 else:
|
|
261 genomeGraph.add((generalURI,RDF.type,createClass(coreURI[feature_type.lower().title()], root=False)))
|
|
262 if feature_type.lower() != "rrna" and feature_type.lower() != "trna" and feature_type.lower() != "tmrna" and feature_type.lower() != "ncrna":
|
|
263 SubClassOfDict[feature_type.lower().title()] = 1
|
|
264 for key in feature.qualifiers:
|
|
265 values = feature.qualifiers[key]
|
|
266 if key == "translation":
|
|
267 pass
|
|
268 elif type(values) == list:
|
|
269 for v in values:
|
|
270 int_add(generalURI,coreURI[key.lower()],v)
|
|
271 else:
|
|
272 int_add(generalURI,coreURI[key.lower()],values)
|
|
273 if feature.type == "CDS":
|
|
274 try:
|
|
275 #Feature is normally submitted to this function
|
|
276 #IF a subfeature is submitted it is submitted as a feature
|
|
277 #And subfeature variable will contain the superfeature
|
|
278 if superfeature:
|
|
279 codon = superfeature.qualifiers["transl_table"][0]
|
|
280 # else:
|
|
281 # codon = subfeature.qualifiers["transl_table"][0]
|
|
282 except:
|
|
283 #Default codon table 11
|
|
284 codon = "11"
|
|
285 #Protein linkage
|
|
286 translation = ""
|
|
287 try:
|
|
288 translation = feature.qualifiers["translation"][0].strip("*")
|
|
289 except KeyError:
|
|
290 #When protein sequence is not given...
|
|
291 if len(feature.location.parts) > 1:
|
|
292 #Exon boundaries?
|
|
293 seq = ''
|
|
294 for loc in feature.location:
|
|
295 seq += record.seq[loc]
|
|
296 if int(feature.location.strand) == -1:
|
|
297 seq = Seq(seq).complement()
|
|
298 else:
|
|
299 seq = Seq(seq)
|
|
300 translation = str(seq.translate(feature.qualifiers["transl_table"][0]))
|
|
301 elif int(feature.location.strand) == -1:
|
|
302 if str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].reverse_complement().translate(codon)).strip("*") != translation:
|
|
303 if len(str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end])) % 3 == 0:
|
|
304 translation = str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].reverse_complement().translate(codon))
|
|
305 else:
|
|
306 translation = ''
|
|
307 elif int(feature.location.strand) == +1:
|
|
308 if len(str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end])) % 3 == 0:
|
|
309 translation = str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].translate(codon))
|
|
310 else:
|
|
311 translation = ''
|
|
312
|
|
313 if translation:
|
|
314 translation = list(translation)
|
|
315 translation[0] = "M"
|
|
316 translation = ''.join(translation).strip("*")
|
|
317 if "*" in translation:
|
|
318 pass
|
|
319
|
|
320 translation = translation.encode('utf-8')
|
|
321 md5_protein = hashlib.md5(translation).hexdigest()
|
|
322 proteinURI = coreURI["protein/"+md5_protein]
|
|
323 genomeGraph.add((generalURI,coreURI["protein"],proteinURI))
|
|
324 for key in feature.qualifiers:
|
|
325 for v in feature.qualifiers[key]:
|
|
326 if key == "translation":
|
|
327 genomeGraph.add((proteinURI,coreURI["md5"],Literal(md5_protein)))
|
|
328 genomeGraph.add((proteinURI,coreURI["sequence"],Literal(translation)))
|
|
329 genomeGraph.add((proteinURI,RDF.type,proteinClass))
|
|
330 else:
|
|
331 for v in feature.qualifiers[key]:
|
|
332 int_add(generalURI,coreURI[key.lower()],v)
|
|
333
|
|
334 def int_add(subject, predicate, obj):
|
|
335 try:
|
|
336 object_float = float(obj.replace('"',''))
|
|
337 object_int = int(obj.replace('"',''))
|
|
338 if object_int == object_float:
|
|
339 genomeGraph.add((subject,predicate,Literal(object_int)))
|
|
340 else:
|
|
341 genomeGraph.add((subject,predicate,Literal(object_float)))
|
|
342 except:
|
|
343 genomeGraph.add((subject,predicate,Literal(obj.replace('"',''))))
|
|
344
|
|
345 def save():
|
|
346 data = genomeGraph.serialize(format='turtle')
|
|
347 open(sys.argv[sys.argv.index("-output")+1],"wb").write(data)
|
|
348
|
|
349 def subClassOfBuilder():
|
|
350 for subclass in SubClassOfDict:
|
|
351 genomeGraph.add((coreURI["Feature"],RDFS.subClassOf,OWL.Thing))
|
|
352 genomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Feature"]))
|
|
353
|
|
354 def subClassOfBuilderRna():
|
|
355 for subclass in SubClassOfDictRna:
|
|
356 genomeGraph.add((coreURI["Feature"],RDFS.subClassOf,OWL.Thing))
|
|
357 genomeGraph.add((coreURI["Rna"],RDFS.subClassOf,coreURI["Feature"]))
|
|
358 genomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Rna"]))
|
|
359 genomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Rna"]))
|
|
360 genomeGraph.add((coreURI[subclass],RDF.type,OWL.Class))
|
|
361
|
|
362 def main():
|
|
363 tmp()
|
|
364 gbk_parser()
|
|
365 subClassOfBuilder()
|
|
366 subClassOfBuilderRna()
|
|
367 save()
|
|
368 cleantmp()
|
|
369
|
|
370 if __name__ == "__main__":
|
|
371 main() |