comparison gbk2rdf/gbktordf.py @ 3:db04e12b8779

Uploaded
author jjkoehorst
date Sat, 21 Feb 2015 07:28:39 -0500
parents
children ec73c34af97b
comparison
equal deleted inserted replaced
2:89be6bd55b4c 3:db04e12b8779
1 #!/usr/bin/env python3.4
2 # Author: Jasper Jan Koehorst
3 # Date created: Feb 21 2015
4 # Function: generation of a RDF file from Genbank/EMBL
5
6 import warnings
7 warnings.filterwarnings("ignore")
8
9 def delete_galaxy():
10 import sys
11 for index, path in enumerate(sys.path):
12 if "galaxy-dist/" in path:
13 sys.path[index] = ''
14
15 #Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function. This is not an elegant solution but it works for now.
16 delete_galaxy()
17
18 from Bio import SeqIO
19 # Import RDFLib's default Graph implementation.
20 import os, sys
21 from Bio.Seq import Seq
22
23 from rdflib import Graph, URIRef, Literal,Namespace,RDF,RDFS,OWL, plugin
24 from rdflib.store import Store
25 import hashlib
26 store = plugin.get('IOMemory', Store)()
27
28 global URI
29 URI = "http://csb.wur.nl/genome/"
30 global seeAlso
31 seeAlso = "rdfs:seeAlso"
32 global coreURI
33 coreURI = Namespace(URI)
34
35 global SubClassOfDict
36 SubClassOfDict = {}
37 global SubClassOfDictRna
38 SubClassOfDictRna = {}
39
40 def createClass(uri, root=True):
41 genomeGraph.add((uri,RDF.type,OWL.Class))
42 if root:
43 genomeGraph.add((uri,RDFS.subClassOf,OWL.Thing))
44 return uri
45
46 def tmp():
47 import time
48 global tmpFolder
49 tmpFolder = "/tmp/"+str(time.time())+"/"
50 os.mkdir(tmpFolder)
51
52 def cleantmp():
53 os.system("ls "+tmpFolder)
54 os.system("rm -rf "+tmpFolder)
55
56 def crawler():
57 #From input folder it looks for GBK file (gz files are in progress)
58 input_file = sys.argv[sys.argv.index("-input")+1]
59 gbk_parser(input_file)
60
61 def gbk_parser():
62 prevObjStart = -1
63 prevObjStop = -1
64 store = plugin.get('IOMemory', Store)()
65 global genomeGraph
66 genomeGraph = Graph(store,URIRef(URI))
67 genomeGraph.bind("ssb",coreURI)
68 input_file = sys.argv[sys.argv.index("-input")+1]
69
70 #CLASS definitions
71 genomeClass = createClass(coreURI["Genome"], root=True)
72 typeClass = createClass(coreURI["DnaObject"], root=True)
73 createClass(coreURI["Protein"], root=True)
74 pubmedClass = createClass(coreURI["Pubmed"], root=True)
75 miscClass = createClass(coreURI["MiscFeature"], root=False)
76 createClass(coreURI["Feature"], root=True)
77 SubClassOfDict["MiscFeature"] = 1
78 SubClassOfDictRna["Trna"] = 1
79 SubClassOfDictRna["Rrna"] = 1
80 SubClassOfDictRna["Tmrna"] = 1
81 SubClassOfDictRna["Ncrna"] = 1
82
83 # codon = "11" #Default initialization if no CDS are present
84 ##################
85 weird_chars = list(''',./?<>:;"'|\}]{[+=_-)(*&^%$#@!±§~` ''')
86 scaf_value = 0
87 #Which files are already done
88 ########
89 formatGBK = sys.argv[sys.argv.index("-format")+1]
90 for record in SeqIO.parse(input_file, formatGBK):
91 #Read first feature for genome name and information...
92 #Ignore the empty GBK file due to the lack of features?
93
94 for index, feature in enumerate(record.features):
95 if index == 0:
96 if "-identifier" in sys.argv:
97 genome = sys.argv[sys.argv.index("-identifier")+1]
98 else:
99 try:
100 genome = feature.qualifiers["organism"][0].replace(" ","_")
101 except:
102 #BUG: THIS IS A TEMP FIX, USE GALAXY -IDENTIFIER TO CAPTURE THIS
103 genome = "XNoneX"
104 for char in weird_chars:
105 genome = genome.replace(char,"_")
106
107 try:
108 gi = record.annotations["gi"]
109 typ = str(gi)
110 except:
111 scaf_value += 1
112 typ = "scaffold_"+str(scaf_value)
113 genomeURI = coreURI[genome]
114 gbkURI = coreURI[genome + "/" + typ]
115 #To contig connection to connect all data to it
116 genomeGraph.add((genomeURI, coreURI["dnaobject"] , gbkURI))
117
118 #General genome features also stored in the class...
119 if "genome" in feature.qualifiers:
120 genomeGraph.add((genomeURI, coreURI["organism"],Literal(feature.qualifiers["organism"][0])))
121 if "strain" in feature.qualifiers:
122 genomeGraph.add((genomeURI, coreURI["strain"],Literal(feature.qualifiers["strain"][0])))
123 if "taxonomy" in record.annotations:
124 for taxon in record.annotations["taxonomy"]:
125 genomeGraph.add((genomeURI, coreURI["taxonomy"],Literal(taxon)))
126 record.annotations["taxonomy"] = []
127 #Genome sequence#
128 sequence = str(record.seq)
129 #Verify if sequence was not empty and is now full of X or N
130 filtered_sequence = sequence.replace("X","").replace("N","")
131 if len(filtered_sequence) == 0:
132 sequence = ""
133 #Record parsing#
134 for annot in record.annotations:
135 if type(record.annotations[annot]) == list:
136 if annot == "references":
137 for references in record.annotations[annot]:
138 if references.pubmed_id != "":
139 pubmed = references.pubmed_id
140 genomeGraph.add((gbkURI, coreURI[annot.lower()] , coreURI["pubmed/"+pubmed]))
141 obj_dict = references.__dict__
142 for key in obj_dict:
143 genomeGraph.add((coreURI["pubmed/"+pubmed], coreURI[key.lower()], Literal(str(obj_dict[key]))))
144 genomeGraph.add((coreURI["pubmed/"+pubmed], RDF.type, pubmedClass))
145
146 else:
147 for a in record.annotations[annot]:
148 int_add(gbkURI,coreURI[annot.lower()],str(a))
149 else:
150 int_add(gbkURI,coreURI[annot.lower()],str(record.annotations[annot]))
151
152
153 #####END of RECORD####
154 if len(sequence) > 0:
155 genomeGraph.add((gbkURI, coreURI["sequence"] , Literal(sequence)))
156 genomeGraph.add((genomeURI, RDF.type,genomeClass))
157 genomeGraph.add((gbkURI, RDF.type,typeClass))
158 for key in feature.qualifiers:
159 genomeGraph.add((gbkURI, coreURI[key.lower()] , Literal(feature.qualifiers[key][0])))
160 #break
161 else: #The rest of the GBK file
162 feature_type = feature.type
163 end = str(feature.location.end).replace(">","").replace("<","")
164 start = str(feature.location.start).replace(">","").replace("<","")
165
166 strand = str(feature.location.strand)
167
168 if strand == 'None':
169 strand = 0
170
171 # if feature_type == "gene":
172 # gene = feature
173 #Store gene in next feature....
174 # gene_location_start = end = str(gene.location.end).replace(">","").replace("<","")
175 # gene_location_stop = str(gene.location.start).replace(">","").replace("<","")
176 # gene_qualifiers = gene.qualifiers
177 else:
178 if feature.type == "misc_feature": #Store as part of previous cds or something...
179 if strand == "-1":
180 miscURI = coreURI[genome + "/" + typ + "/"+feature_type+"/gbk/"+str(end)+"_"+str(start)]
181 else:
182 miscURI = coreURI[genome + "/" + typ + "/"+feature_type+"/gbk/"+str(start)+"_"+str(end)]
183
184 # genomeGraph.add((generalURI,coreURI["subFeature"],miscURI))
185
186 # TODO: Check if biopython has an overlap function...
187 if int(prevObjStart) <= int(start):
188 if int(end) <= int(prevObjStop):
189 pass
190 # genomeGraph.add((typeURI,coreURI["feature"],miscURI))
191 # genomeGraph.add((miscURI,RDF.type,miscClass))
192 else:
193 genomeGraph.add((gbkURI, coreURI["feature"] , miscURI))
194 genomeGraph.add((miscURI,RDF.type,miscClass))
195 else:
196 genomeGraph.add((gbkURI, coreURI["feature"] , miscURI))
197 genomeGraph.add((miscURI,RDF.type,miscClass))
198
199 store_general_information(miscURI,feature,record)
200 else:
201 prevObjStart = start
202 prevObjStop = end
203
204
205 if strand == "-1":
206 typeURI = coreURI[genome + "/" + typ + "/" + feature_type+"/gbk/"+str(end)+"_"+str(start)]
207 else:
208 typeURI = coreURI[genome + "/" + typ + "/" + feature_type+"/gbk/"+str(start)+"_"+str(end)]
209
210 # cds_sequence = str(feature.extract(sequence))
211 #Contig specific connection
212
213 genomeGraph.add((gbkURI, coreURI["feature"] , typeURI))
214 ############################
215
216 store_general_information(typeURI,feature,record)
217
218 for subfeature in feature.sub_features:
219 strand = str(subfeature.location.strand)
220 subfeature_type = subfeature.type
221 end = str(subfeature.location.end).replace(">","").replace("<","")
222 start = str(subfeature.location.start).replace(">","").replace("<","")
223
224 if strand == "-1":
225 subURI = coreURI[genome + "/" + typ + "/" + subfeature_type+"/gbk/"+str(end)+"_"+str(start)]
226 else:
227 subURI = coreURI[genome + "/" + typ + "/" + subfeature_type+"/gbk/"+str(start)+"_"+str(end)]
228 genomeGraph.add((typeURI, coreURI["feature"] , subURI))
229 store_general_information(subURI,subfeature,record,feature)
230
231 def store_general_information(generalURI,feature,record,superfeature=""):
232 proteinClass = createClass(coreURI["Protein"], root=True)
233 sequence = str(record.seq)
234 cds_sequence = str(feature.extract(sequence))
235 #Fixes the 0 count instead of 1-count in biopython vs humans
236 feature_type = feature.type
237 end = str(feature.location.end).replace(">","").replace("<","")
238 start = str(feature.location.start).replace(">","").replace("<","")
239 strand = str(feature.location.strand)
240 if strand == "None":
241 strand = 0
242
243 genomeGraph.add((generalURI,coreURI["sourcedb"],Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
244
245 if strand == "-1":
246 genomeGraph.add((generalURI,coreURI["end"],Literal(int(start)+1)))
247 genomeGraph.add((generalURI,coreURI["begin"],Literal(int(end))))
248 else:
249 genomeGraph.add((generalURI,coreURI["begin"],Literal(int(start)+1)))
250 genomeGraph.add((generalURI,coreURI["end"],Literal(int(end))))
251 genomeGraph.add((generalURI,coreURI["strand"],Literal(int(strand))))
252 if feature.type != "misc_feature":
253 try:
254 genomeGraph.add((generalURI,coreURI["sequence"],Literal(cds_sequence)))
255 except: #When protein sequence is not given for whatever reason
256 print ("wrong?")
257
258 if feature.type == "misc_feature":
259 pass
260 else:
261 genomeGraph.add((generalURI,RDF.type,createClass(coreURI[feature_type.lower().title()], root=False)))
262 if feature_type.lower() != "rrna" and feature_type.lower() != "trna" and feature_type.lower() != "tmrna" and feature_type.lower() != "ncrna":
263 SubClassOfDict[feature_type.lower().title()] = 1
264 for key in feature.qualifiers:
265 values = feature.qualifiers[key]
266 if key == "translation":
267 pass
268 elif type(values) == list:
269 for v in values:
270 int_add(generalURI,coreURI[key.lower()],v)
271 else:
272 int_add(generalURI,coreURI[key.lower()],values)
273 if feature.type == "CDS":
274 try:
275 #Feature is normally submitted to this function
276 #IF a subfeature is submitted it is submitted as a feature
277 #And subfeature variable will contain the superfeature
278 if superfeature:
279 codon = superfeature.qualifiers["transl_table"][0]
280 # else:
281 # codon = subfeature.qualifiers["transl_table"][0]
282 except:
283 #Default codon table 11
284 codon = "11"
285 #Protein linkage
286 translation = ""
287 try:
288 translation = feature.qualifiers["translation"][0].strip("*")
289 except KeyError:
290 #When protein sequence is not given...
291 if len(feature.location.parts) > 1:
292 #Exon boundaries?
293 seq = ''
294 for loc in feature.location:
295 seq += record.seq[loc]
296 if int(feature.location.strand) == -1:
297 seq = Seq(seq).complement()
298 else:
299 seq = Seq(seq)
300 translation = str(seq.translate(feature.qualifiers["transl_table"][0]))
301 elif int(feature.location.strand) == -1:
302 if str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].reverse_complement().translate(codon)).strip("*") != translation:
303 if len(str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end])) % 3 == 0:
304 translation = str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].reverse_complement().translate(codon))
305 else:
306 translation = ''
307 elif int(feature.location.strand) == +1:
308 if len(str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end])) % 3 == 0:
309 translation = str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].translate(codon))
310 else:
311 translation = ''
312
313 if translation:
314 translation = list(translation)
315 translation[0] = "M"
316 translation = ''.join(translation).strip("*")
317 if "*" in translation:
318 pass
319
320 translation = translation.encode('utf-8')
321 md5_protein = hashlib.md5(translation).hexdigest()
322 proteinURI = coreURI["protein/"+md5_protein]
323 genomeGraph.add((generalURI,coreURI["protein"],proteinURI))
324 for key in feature.qualifiers:
325 for v in feature.qualifiers[key]:
326 if key == "translation":
327 genomeGraph.add((proteinURI,coreURI["md5"],Literal(md5_protein)))
328 genomeGraph.add((proteinURI,coreURI["sequence"],Literal(translation)))
329 genomeGraph.add((proteinURI,RDF.type,proteinClass))
330 else:
331 for v in feature.qualifiers[key]:
332 int_add(generalURI,coreURI[key.lower()],v)
333
334 def int_add(subject, predicate, obj):
335 try:
336 object_float = float(obj.replace('"',''))
337 object_int = int(obj.replace('"',''))
338 if object_int == object_float:
339 genomeGraph.add((subject,predicate,Literal(object_int)))
340 else:
341 genomeGraph.add((subject,predicate,Literal(object_float)))
342 except:
343 genomeGraph.add((subject,predicate,Literal(obj.replace('"',''))))
344
345 def save():
346 data = genomeGraph.serialize(format='turtle')
347 open(sys.argv[sys.argv.index("-output")+1],"wb").write(data)
348
349 def subClassOfBuilder():
350 for subclass in SubClassOfDict:
351 genomeGraph.add((coreURI["Feature"],RDFS.subClassOf,OWL.Thing))
352 genomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Feature"]))
353
354 def subClassOfBuilderRna():
355 for subclass in SubClassOfDictRna:
356 genomeGraph.add((coreURI["Feature"],RDFS.subClassOf,OWL.Thing))
357 genomeGraph.add((coreURI["Rna"],RDFS.subClassOf,coreURI["Feature"]))
358 genomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Rna"]))
359 genomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Rna"]))
360 genomeGraph.add((coreURI[subclass],RDF.type,OWL.Class))
361
362 def main():
363 tmp()
364 gbk_parser()
365 subClassOfBuilder()
366 subClassOfBuilderRna()
367 save()
368 cleantmp()
369
370 if __name__ == "__main__":
371 main()