Mercurial > repos > jjkoehorst > sapp
comparison gbk2rdf/gbktordf.py @ 3:db04e12b8779
Uploaded
author | jjkoehorst |
---|---|
date | Sat, 21 Feb 2015 07:28:39 -0500 |
parents | |
children | ec73c34af97b |
comparison
equal
deleted
inserted
replaced
2:89be6bd55b4c | 3:db04e12b8779 |
---|---|
1 #!/usr/bin/env python3.4 | |
2 # Author: Jasper Jan Koehorst | |
3 # Date created: Feb 21 2015 | |
4 # Function: generation of a RDF file from Genbank/EMBL | |
5 | |
6 import warnings | |
7 warnings.filterwarnings("ignore") | |
8 | |
9 def delete_galaxy(): | |
10 import sys | |
11 for index, path in enumerate(sys.path): | |
12 if "galaxy-dist/" in path: | |
13 sys.path[index] = '' | |
14 | |
15 #Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function. This is not an elegant solution but it works for now. | |
16 delete_galaxy() | |
17 | |
18 from Bio import SeqIO | |
19 # Import RDFLib's default Graph implementation. | |
20 import os, sys | |
21 from Bio.Seq import Seq | |
22 | |
23 from rdflib import Graph, URIRef, Literal,Namespace,RDF,RDFS,OWL, plugin | |
24 from rdflib.store import Store | |
25 import hashlib | |
26 store = plugin.get('IOMemory', Store)() | |
27 | |
28 global URI | |
29 URI = "http://csb.wur.nl/genome/" | |
30 global seeAlso | |
31 seeAlso = "rdfs:seeAlso" | |
32 global coreURI | |
33 coreURI = Namespace(URI) | |
34 | |
35 global SubClassOfDict | |
36 SubClassOfDict = {} | |
37 global SubClassOfDictRna | |
38 SubClassOfDictRna = {} | |
39 | |
40 def createClass(uri, root=True): | |
41 genomeGraph.add((uri,RDF.type,OWL.Class)) | |
42 if root: | |
43 genomeGraph.add((uri,RDFS.subClassOf,OWL.Thing)) | |
44 return uri | |
45 | |
46 def tmp(): | |
47 import time | |
48 global tmpFolder | |
49 tmpFolder = "/tmp/"+str(time.time())+"/" | |
50 os.mkdir(tmpFolder) | |
51 | |
52 def cleantmp(): | |
53 os.system("ls "+tmpFolder) | |
54 os.system("rm -rf "+tmpFolder) | |
55 | |
56 def crawler(): | |
57 #From input folder it looks for GBK file (gz files are in progress) | |
58 input_file = sys.argv[sys.argv.index("-input")+1] | |
59 gbk_parser(input_file) | |
60 | |
61 def gbk_parser(): | |
62 prevObjStart = -1 | |
63 prevObjStop = -1 | |
64 store = plugin.get('IOMemory', Store)() | |
65 global genomeGraph | |
66 genomeGraph = Graph(store,URIRef(URI)) | |
67 genomeGraph.bind("ssb",coreURI) | |
68 input_file = sys.argv[sys.argv.index("-input")+1] | |
69 | |
70 #CLASS definitions | |
71 genomeClass = createClass(coreURI["Genome"], root=True) | |
72 typeClass = createClass(coreURI["DnaObject"], root=True) | |
73 createClass(coreURI["Protein"], root=True) | |
74 pubmedClass = createClass(coreURI["Pubmed"], root=True) | |
75 miscClass = createClass(coreURI["MiscFeature"], root=False) | |
76 createClass(coreURI["Feature"], root=True) | |
77 SubClassOfDict["MiscFeature"] = 1 | |
78 SubClassOfDictRna["Trna"] = 1 | |
79 SubClassOfDictRna["Rrna"] = 1 | |
80 SubClassOfDictRna["Tmrna"] = 1 | |
81 SubClassOfDictRna["Ncrna"] = 1 | |
82 | |
83 # codon = "11" #Default initialization if no CDS are present | |
84 ################## | |
85 weird_chars = list(''',./?<>:;"'|\}]{[+=_-)(*&^%$#@!±§~` ''') | |
86 scaf_value = 0 | |
87 #Which files are already done | |
88 ######## | |
89 formatGBK = sys.argv[sys.argv.index("-format")+1] | |
90 for record in SeqIO.parse(input_file, formatGBK): | |
91 #Read first feature for genome name and information... | |
92 #Ignore the empty GBK file due to the lack of features? | |
93 | |
94 for index, feature in enumerate(record.features): | |
95 if index == 0: | |
96 if "-identifier" in sys.argv: | |
97 genome = sys.argv[sys.argv.index("-identifier")+1] | |
98 else: | |
99 try: | |
100 genome = feature.qualifiers["organism"][0].replace(" ","_") | |
101 except: | |
102 #BUG: THIS IS A TEMP FIX, USE GALAXY -IDENTIFIER TO CAPTURE THIS | |
103 genome = "XNoneX" | |
104 for char in weird_chars: | |
105 genome = genome.replace(char,"_") | |
106 | |
107 try: | |
108 gi = record.annotations["gi"] | |
109 typ = str(gi) | |
110 except: | |
111 scaf_value += 1 | |
112 typ = "scaffold_"+str(scaf_value) | |
113 genomeURI = coreURI[genome] | |
114 gbkURI = coreURI[genome + "/" + typ] | |
115 #To contig connection to connect all data to it | |
116 genomeGraph.add((genomeURI, coreURI["dnaobject"] , gbkURI)) | |
117 | |
118 #General genome features also stored in the class... | |
119 if "genome" in feature.qualifiers: | |
120 genomeGraph.add((genomeURI, coreURI["organism"],Literal(feature.qualifiers["organism"][0]))) | |
121 if "strain" in feature.qualifiers: | |
122 genomeGraph.add((genomeURI, coreURI["strain"],Literal(feature.qualifiers["strain"][0]))) | |
123 if "taxonomy" in record.annotations: | |
124 for taxon in record.annotations["taxonomy"]: | |
125 genomeGraph.add((genomeURI, coreURI["taxonomy"],Literal(taxon))) | |
126 record.annotations["taxonomy"] = [] | |
127 #Genome sequence# | |
128 sequence = str(record.seq) | |
129 #Verify if sequence was not empty and is now full of X or N | |
130 filtered_sequence = sequence.replace("X","").replace("N","") | |
131 if len(filtered_sequence) == 0: | |
132 sequence = "" | |
133 #Record parsing# | |
134 for annot in record.annotations: | |
135 if type(record.annotations[annot]) == list: | |
136 if annot == "references": | |
137 for references in record.annotations[annot]: | |
138 if references.pubmed_id != "": | |
139 pubmed = references.pubmed_id | |
140 genomeGraph.add((gbkURI, coreURI[annot.lower()] , coreURI["pubmed/"+pubmed])) | |
141 obj_dict = references.__dict__ | |
142 for key in obj_dict: | |
143 genomeGraph.add((coreURI["pubmed/"+pubmed], coreURI[key.lower()], Literal(str(obj_dict[key])))) | |
144 genomeGraph.add((coreURI["pubmed/"+pubmed], RDF.type, pubmedClass)) | |
145 | |
146 else: | |
147 for a in record.annotations[annot]: | |
148 int_add(gbkURI,coreURI[annot.lower()],str(a)) | |
149 else: | |
150 int_add(gbkURI,coreURI[annot.lower()],str(record.annotations[annot])) | |
151 | |
152 | |
153 #####END of RECORD#### | |
154 if len(sequence) > 0: | |
155 genomeGraph.add((gbkURI, coreURI["sequence"] , Literal(sequence))) | |
156 genomeGraph.add((genomeURI, RDF.type,genomeClass)) | |
157 genomeGraph.add((gbkURI, RDF.type,typeClass)) | |
158 for key in feature.qualifiers: | |
159 genomeGraph.add((gbkURI, coreURI[key.lower()] , Literal(feature.qualifiers[key][0]))) | |
160 #break | |
161 else: #The rest of the GBK file | |
162 feature_type = feature.type | |
163 end = str(feature.location.end).replace(">","").replace("<","") | |
164 start = str(feature.location.start).replace(">","").replace("<","") | |
165 | |
166 strand = str(feature.location.strand) | |
167 | |
168 if strand == 'None': | |
169 strand = 0 | |
170 | |
171 # if feature_type == "gene": | |
172 # gene = feature | |
173 #Store gene in next feature.... | |
174 # gene_location_start = end = str(gene.location.end).replace(">","").replace("<","") | |
175 # gene_location_stop = str(gene.location.start).replace(">","").replace("<","") | |
176 # gene_qualifiers = gene.qualifiers | |
177 else: | |
178 if feature.type == "misc_feature": #Store as part of previous cds or something... | |
179 if strand == "-1": | |
180 miscURI = coreURI[genome + "/" + typ + "/"+feature_type+"/gbk/"+str(end)+"_"+str(start)] | |
181 else: | |
182 miscURI = coreURI[genome + "/" + typ + "/"+feature_type+"/gbk/"+str(start)+"_"+str(end)] | |
183 | |
184 # genomeGraph.add((generalURI,coreURI["subFeature"],miscURI)) | |
185 | |
186 # TODO: Check if biopython has an overlap function... | |
187 if int(prevObjStart) <= int(start): | |
188 if int(end) <= int(prevObjStop): | |
189 pass | |
190 # genomeGraph.add((typeURI,coreURI["feature"],miscURI)) | |
191 # genomeGraph.add((miscURI,RDF.type,miscClass)) | |
192 else: | |
193 genomeGraph.add((gbkURI, coreURI["feature"] , miscURI)) | |
194 genomeGraph.add((miscURI,RDF.type,miscClass)) | |
195 else: | |
196 genomeGraph.add((gbkURI, coreURI["feature"] , miscURI)) | |
197 genomeGraph.add((miscURI,RDF.type,miscClass)) | |
198 | |
199 store_general_information(miscURI,feature,record) | |
200 else: | |
201 prevObjStart = start | |
202 prevObjStop = end | |
203 | |
204 | |
205 if strand == "-1": | |
206 typeURI = coreURI[genome + "/" + typ + "/" + feature_type+"/gbk/"+str(end)+"_"+str(start)] | |
207 else: | |
208 typeURI = coreURI[genome + "/" + typ + "/" + feature_type+"/gbk/"+str(start)+"_"+str(end)] | |
209 | |
210 # cds_sequence = str(feature.extract(sequence)) | |
211 #Contig specific connection | |
212 | |
213 genomeGraph.add((gbkURI, coreURI["feature"] , typeURI)) | |
214 ############################ | |
215 | |
216 store_general_information(typeURI,feature,record) | |
217 | |
218 for subfeature in feature.sub_features: | |
219 strand = str(subfeature.location.strand) | |
220 subfeature_type = subfeature.type | |
221 end = str(subfeature.location.end).replace(">","").replace("<","") | |
222 start = str(subfeature.location.start).replace(">","").replace("<","") | |
223 | |
224 if strand == "-1": | |
225 subURI = coreURI[genome + "/" + typ + "/" + subfeature_type+"/gbk/"+str(end)+"_"+str(start)] | |
226 else: | |
227 subURI = coreURI[genome + "/" + typ + "/" + subfeature_type+"/gbk/"+str(start)+"_"+str(end)] | |
228 genomeGraph.add((typeURI, coreURI["feature"] , subURI)) | |
229 store_general_information(subURI,subfeature,record,feature) | |
230 | |
231 def store_general_information(generalURI,feature,record,superfeature=""): | |
232 proteinClass = createClass(coreURI["Protein"], root=True) | |
233 sequence = str(record.seq) | |
234 cds_sequence = str(feature.extract(sequence)) | |
235 #Fixes the 0 count instead of 1-count in biopython vs humans | |
236 feature_type = feature.type | |
237 end = str(feature.location.end).replace(">","").replace("<","") | |
238 start = str(feature.location.start).replace(">","").replace("<","") | |
239 strand = str(feature.location.strand) | |
240 if strand == "None": | |
241 strand = 0 | |
242 | |
243 genomeGraph.add((generalURI,coreURI["sourcedb"],Literal(sys.argv[sys.argv.index("-sourcedb")+1]))) | |
244 | |
245 if strand == "-1": | |
246 genomeGraph.add((generalURI,coreURI["end"],Literal(int(start)+1))) | |
247 genomeGraph.add((generalURI,coreURI["begin"],Literal(int(end)))) | |
248 else: | |
249 genomeGraph.add((generalURI,coreURI["begin"],Literal(int(start)+1))) | |
250 genomeGraph.add((generalURI,coreURI["end"],Literal(int(end)))) | |
251 genomeGraph.add((generalURI,coreURI["strand"],Literal(int(strand)))) | |
252 if feature.type != "misc_feature": | |
253 try: | |
254 genomeGraph.add((generalURI,coreURI["sequence"],Literal(cds_sequence))) | |
255 except: #When protein sequence is not given for whatever reason | |
256 print ("wrong?") | |
257 | |
258 if feature.type == "misc_feature": | |
259 pass | |
260 else: | |
261 genomeGraph.add((generalURI,RDF.type,createClass(coreURI[feature_type.lower().title()], root=False))) | |
262 if feature_type.lower() != "rrna" and feature_type.lower() != "trna" and feature_type.lower() != "tmrna" and feature_type.lower() != "ncrna": | |
263 SubClassOfDict[feature_type.lower().title()] = 1 | |
264 for key in feature.qualifiers: | |
265 values = feature.qualifiers[key] | |
266 if key == "translation": | |
267 pass | |
268 elif type(values) == list: | |
269 for v in values: | |
270 int_add(generalURI,coreURI[key.lower()],v) | |
271 else: | |
272 int_add(generalURI,coreURI[key.lower()],values) | |
273 if feature.type == "CDS": | |
274 try: | |
275 #Feature is normally submitted to this function | |
276 #IF a subfeature is submitted it is submitted as a feature | |
277 #And subfeature variable will contain the superfeature | |
278 if superfeature: | |
279 codon = superfeature.qualifiers["transl_table"][0] | |
280 # else: | |
281 # codon = subfeature.qualifiers["transl_table"][0] | |
282 except: | |
283 #Default codon table 11 | |
284 codon = "11" | |
285 #Protein linkage | |
286 translation = "" | |
287 try: | |
288 translation = feature.qualifiers["translation"][0].strip("*") | |
289 except KeyError: | |
290 #When protein sequence is not given... | |
291 if len(feature.location.parts) > 1: | |
292 #Exon boundaries? | |
293 seq = '' | |
294 for loc in feature.location: | |
295 seq += record.seq[loc] | |
296 if int(feature.location.strand) == -1: | |
297 seq = Seq(seq).complement() | |
298 else: | |
299 seq = Seq(seq) | |
300 translation = str(seq.translate(feature.qualifiers["transl_table"][0])) | |
301 elif int(feature.location.strand) == -1: | |
302 if str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].reverse_complement().translate(codon)).strip("*") != translation: | |
303 if len(str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end])) % 3 == 0: | |
304 translation = str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].reverse_complement().translate(codon)) | |
305 else: | |
306 translation = '' | |
307 elif int(feature.location.strand) == +1: | |
308 if len(str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end])) % 3 == 0: | |
309 translation = str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].translate(codon)) | |
310 else: | |
311 translation = '' | |
312 | |
313 if translation: | |
314 translation = list(translation) | |
315 translation[0] = "M" | |
316 translation = ''.join(translation).strip("*") | |
317 if "*" in translation: | |
318 pass | |
319 | |
320 translation = translation.encode('utf-8') | |
321 md5_protein = hashlib.md5(translation).hexdigest() | |
322 proteinURI = coreURI["protein/"+md5_protein] | |
323 genomeGraph.add((generalURI,coreURI["protein"],proteinURI)) | |
324 for key in feature.qualifiers: | |
325 for v in feature.qualifiers[key]: | |
326 if key == "translation": | |
327 genomeGraph.add((proteinURI,coreURI["md5"],Literal(md5_protein))) | |
328 genomeGraph.add((proteinURI,coreURI["sequence"],Literal(translation))) | |
329 genomeGraph.add((proteinURI,RDF.type,proteinClass)) | |
330 else: | |
331 for v in feature.qualifiers[key]: | |
332 int_add(generalURI,coreURI[key.lower()],v) | |
333 | |
334 def int_add(subject, predicate, obj): | |
335 try: | |
336 object_float = float(obj.replace('"','')) | |
337 object_int = int(obj.replace('"','')) | |
338 if object_int == object_float: | |
339 genomeGraph.add((subject,predicate,Literal(object_int))) | |
340 else: | |
341 genomeGraph.add((subject,predicate,Literal(object_float))) | |
342 except: | |
343 genomeGraph.add((subject,predicate,Literal(obj.replace('"','')))) | |
344 | |
345 def save(): | |
346 data = genomeGraph.serialize(format='turtle') | |
347 open(sys.argv[sys.argv.index("-output")+1],"wb").write(data) | |
348 | |
349 def subClassOfBuilder(): | |
350 for subclass in SubClassOfDict: | |
351 genomeGraph.add((coreURI["Feature"],RDFS.subClassOf,OWL.Thing)) | |
352 genomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Feature"])) | |
353 | |
354 def subClassOfBuilderRna(): | |
355 for subclass in SubClassOfDictRna: | |
356 genomeGraph.add((coreURI["Feature"],RDFS.subClassOf,OWL.Thing)) | |
357 genomeGraph.add((coreURI["Rna"],RDFS.subClassOf,coreURI["Feature"])) | |
358 genomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Rna"])) | |
359 genomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Rna"])) | |
360 genomeGraph.add((coreURI[subclass],RDF.type,OWL.Class)) | |
361 | |
362 def main(): | |
363 tmp() | |
364 gbk_parser() | |
365 subClassOfBuilder() | |
366 subClassOfBuilderRna() | |
367 save() | |
368 cleantmp() | |
369 | |
370 if __name__ == "__main__": | |
371 main() |