16
+ − 1 #!/usr/bin/env python3.4
+ − 2 # Author: Jasper Jan Koehorst
+ − 3 # Date created: Feb 21 2015
+ − 4 # Function: generation of a RDF file from Genbank/EMBL
+ − 5
+ − 6 import warnings
+ − 7 warnings.filterwarnings("ignore")
+ − 8
+ − 9 def delete_galaxy():
+ − 10 import sys
+ − 11 for index, path in enumerate(sys.path):
+ − 12 if "galaxy-dist/" in path:
+ − 13 sys.path[index] = ''
+ − 14
+ − 15 #Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function. This is not an elegant solution but it works for now.
+ − 16 delete_galaxy()
+ − 17
+ − 18 from Bio import SeqIO
+ − 19 # Import RDFLib's default Graph implementation.
+ − 20 import os, sys
+ − 21 from Bio.Seq import Seq
+ − 22
+ − 23 from rdflib import Graph, URIRef, Literal,Namespace,RDF,RDFS,OWL, plugin
+ − 24 from rdflib.store import Store
+ − 25 import hashlib
+ − 26 store = plugin.get('IOMemory', Store)()
+ − 27
+ − 28 global URI
+ − 29 URI = "http://csb.wur.nl/genome/"
+ − 30 global seeAlso
+ − 31 seeAlso = "rdfs:seeAlso"
+ − 32 global coreURI
+ − 33 coreURI = Namespace(URI)
+ − 34
+ − 35 global SubClassOfDict
+ − 36 SubClassOfDict = {}
+ − 37 global SubClassOfDictRna
+ − 38 SubClassOfDictRna = {}
+ − 39
+ − 40 def createClass(uri, root=True):
+ − 41 genomeGraph.add((uri,RDF.type,OWL.Class))
+ − 42 if root:
+ − 43 genomeGraph.add((uri,RDFS.subClassOf,OWL.Thing))
+ − 44 return uri
+ − 45
+ − 46 def tmp():
+ − 47 import time
+ − 48 global tmpFolder
+ − 49 tmpFolder = "/tmp/"+str(time.time())+"/"
+ − 50 os.mkdir(tmpFolder)
+ − 51
+ − 52 def cleantmp():
+ − 53 os.system("ls "+tmpFolder)
+ − 54 os.system("rm -rf "+tmpFolder)
+ − 55
+ − 56 def crawler():
+ − 57 #From input folder it looks for GBK file (gz files are in progress)
+ − 58 input_file = sys.argv[sys.argv.index("-input")+1]
+ − 59 gbk_parser(input_file)
+ − 60
+ − 61 def gbk_parser():
+ − 62 prevObjStart = -1
+ − 63 prevObjStop = -1
+ − 64 store = plugin.get('IOMemory', Store)()
+ − 65 global genomeGraph
+ − 66 genomeGraph = Graph(store,URIRef(URI))
+ − 67 genomeGraph.bind("ssb",coreURI)
+ − 68 input_file = sys.argv[sys.argv.index("-input")+1]
+ − 69
+ − 70 #CLASS definitions
+ − 71 genomeClass = createClass(coreURI["Genome"], root=True)
+ − 72 typeClass = createClass(coreURI["DnaObject"], root=True)
+ − 73 createClass(coreURI["Protein"], root=True)
+ − 74 pubmedClass = createClass(coreURI["Pubmed"], root=True)
+ − 75 miscClass = createClass(coreURI["MiscFeature"], root=False)
+ − 76 createClass(coreURI["Feature"], root=True)
+ − 77 SubClassOfDict["MiscFeature"] = 1
+ − 78 SubClassOfDictRna["Trna"] = 1
+ − 79 SubClassOfDictRna["Rrna"] = 1
+ − 80 SubClassOfDictRna["Tmrna"] = 1
+ − 81 SubClassOfDictRna["Ncrna"] = 1
+ − 82
+ − 83 # codon = "11" #Default initialization if no CDS are present
+ − 84 ##################
+ − 85 weird_chars = list(''',./?<>:;"'|\}]{[+=_-)(*&^%$#@!±§~` ''')
+ − 86 scaf_value = 0
+ − 87 #Which files are already done
+ − 88 ########
+ − 89 formatGBK = sys.argv[sys.argv.index("-format")+1]
+ − 90 for record in SeqIO.parse(input_file, formatGBK):
+ − 91 #Read first feature for genome name and information...
+ − 92 #Ignore the empty GBK file due to the lack of features?
+ − 93
+ − 94 for index, feature in enumerate(record.features):
+ − 95 if index == 0:
+ − 96 if "-identifier" in sys.argv:
+ − 97 genome = sys.argv[sys.argv.index("-identifier")+1]
+ − 98 else:
+ − 99 try:
+ − 100 genome = feature.qualifiers["organism"][0].replace(" ","_")
+ − 101 except:
+ − 102 #BUG: THIS IS A TEMP FIX, USE GALAXY -IDENTIFIER TO CAPTURE THIS
+ − 103 genome = "XNoneX"
+ − 104 for char in weird_chars:
+ − 105 genome = genome.replace(char,"_")
+ − 106
+ − 107 try:
+ − 108 gi = record.annotations["gi"]
+ − 109 typ = str(gi)
+ − 110 except:
+ − 111 try:
+ − 112 gi = record.annotations["accessions"][0]
+ − 113 typ = str(gi)
+ − 114 except:
+ − 115 scaf_value += 1
+ − 116 typ = "scaffold_"+str(scaf_value)
+ − 117 genomeURI = coreURI[genome]
+ − 118 gbkURI = coreURI[genome + "/" + typ]
+ − 119 #To contig connection to connect all data to it
+ − 120 genomeGraph.add((genomeURI, coreURI["dnaobject"] , gbkURI))
+ − 121
+ − 122 #General genome features also stored in the class...
+ − 123 if "genome" in feature.qualifiers:
+ − 124 genomeGraph.add((genomeURI, coreURI["organism"],Literal(feature.qualifiers["organism"][0])))
+ − 125 if "strain" in feature.qualifiers:
+ − 126 genomeGraph.add((genomeURI, coreURI["strain"],Literal(feature.qualifiers["strain"][0])))
+ − 127 if "taxonomy" in record.annotations:
+ − 128 for taxon in record.annotations["taxonomy"]:
+ − 129 genomeGraph.add((genomeURI, coreURI["taxonomy"],Literal(taxon)))
+ − 130 record.annotations["taxonomy"] = []
+ − 131 #Genome sequence#
+ − 132 sequence = str(record.seq)
+ − 133 #Verify if sequence was not empty and is now full of X or N
+ − 134 filtered_sequence = sequence.replace("X","").replace("N","")
+ − 135 if len(filtered_sequence) == 0:
+ − 136 sequence = ""
+ − 137 #Record parsing#
+ − 138 for annot in record.annotations:
+ − 139 if type(record.annotations[annot]) == list:
+ − 140 if annot == "references":
+ − 141 for references in record.annotations[annot]:
+ − 142 if references.pubmed_id != "":
+ − 143 pubmed = references.pubmed_id
+ − 144 genomeGraph.add((gbkURI, coreURI[annot.lower()] , coreURI["pubmed/"+pubmed]))
+ − 145 obj_dict = references.__dict__
+ − 146 for key in obj_dict:
+ − 147 genomeGraph.add((coreURI["pubmed/"+pubmed], coreURI[key.lower()], Literal(str(obj_dict[key]))))
+ − 148 genomeGraph.add((coreURI["pubmed/"+pubmed], RDF.type, pubmedClass))
+ − 149
+ − 150 else:
+ − 151 for a in record.annotations[annot]:
+ − 152 int_add(gbkURI,coreURI[annot.lower()],str(a))
+ − 153 else:
+ − 154 int_add(gbkURI,coreURI[annot.lower()],str(record.annotations[annot]))
+ − 155
+ − 156 #####END of RECORD####
+ − 157 if len(sequence) > 0:
+ − 158 genomeGraph.add((gbkURI, coreURI["sequence"] , Literal(sequence)))
+ − 159 genomeGraph.add((genomeURI, RDF.type,genomeClass))
+ − 160 genomeGraph.add((gbkURI, RDF.type,typeClass))
+ − 161 for key in feature.qualifiers:
+ − 162 genomeGraph.add((gbkURI, coreURI[key.lower()] , Literal(feature.qualifiers[key][0])))
+ − 163 #break
+ − 164 else: #The rest of the GBK file
+ − 165 feature_type = feature.type
+ − 166 end = str(feature.location.end).replace(">","").replace("<","")
+ − 167 start = str(feature.location.start).replace(">","").replace("<","")
+ − 168
+ − 169 strand = str(feature.location.strand)
+ − 170
+ − 171 if strand == 'None':
+ − 172 strand = 0
+ − 173 else:
+ − 174 if feature.type == "misc_feature": #Store as part of previous cds or something...
+ − 175 if strand == "-1":
+ − 176 miscURI = coreURI[genome + "/" + typ + "/"+feature_type+"/gbk/"+str(end)+"_"+str(start)]
+ − 177 else:
+ − 178 miscURI = coreURI[genome + "/" + typ + "/"+feature_type+"/gbk/"+str(start)+"_"+str(end)]
+ − 179
+ − 180 # TODO: Check if biopython has an overlap function...
+ − 181 if int(prevObjStart) <= int(start):
+ − 182 if int(end) <= int(prevObjStop):
+ − 183 pass
+ − 184 # genomeGraph.add((typeURI,coreURI["feature"],miscURI))
+ − 185 # genomeGraph.add((miscURI,RDF.type,miscClass))
+ − 186 else:
+ − 187 genomeGraph.add((gbkURI, coreURI["feature"] , miscURI))
+ − 188 genomeGraph.add((miscURI,RDF.type,miscClass))
+ − 189 else:
+ − 190 genomeGraph.add((gbkURI, coreURI["feature"] , miscURI))
+ − 191 genomeGraph.add((miscURI,RDF.type,miscClass))
+ − 192
+ − 193 store_general_information(miscURI,feature,record)
+ − 194 else:
+ − 195 prevObjStart = start
+ − 196 prevObjStop = end
+ − 197
+ − 198 if strand == "-1":
+ − 199 typeURI = coreURI[genome + "/" + typ + "/" + feature_type+"/gbk/"+str(end)+"_"+str(start)]
+ − 200 else:
+ − 201 typeURI = coreURI[genome + "/" + typ + "/" + feature_type+"/gbk/"+str(start)+"_"+str(end)]
+ − 202
+ − 203 #Contig specific connection
+ − 204 genomeGraph.add((gbkURI, coreURI["feature"] , typeURI))
+ − 205 ############################
+ − 206
+ − 207 store_general_information(typeURI,feature,record)
+ − 208
+ − 209 for subfeature in feature.sub_features:
+ − 210 strand = str(subfeature.location.strand)
+ − 211 subfeature_type = subfeature.type
+ − 212 end = str(subfeature.location.end).replace(">","").replace("<","")
+ − 213 start = str(subfeature.location.start).replace(">","").replace("<","")
+ − 214
+ − 215 if strand == "-1":
+ − 216 subURI = coreURI[genome + "/" + typ + "/" + subfeature_type+"/gbk/"+str(end)+"_"+str(start)]
+ − 217 else:
+ − 218 subURI = coreURI[genome + "/" + typ + "/" + subfeature_type+"/gbk/"+str(start)+"_"+str(end)]
+ − 219 genomeGraph.add((typeURI, coreURI["feature"] , subURI))
+ − 220 store_general_information(subURI,subfeature,record,feature)
+ − 221
+ − 222
+ − 223 def store_general_information(generalURI,feature,record,superfeature=""):
+ − 224 proteinClass = createClass(coreURI["Protein"], root=True)
+ − 225 sequence = str(record.seq)
+ − 226 cds_sequence = str(feature.extract(sequence))
+ − 227 #Fixes the 0 count instead of 1-count in biopython vs humans
+ − 228 feature_type = feature.type
+ − 229 end = str(feature.location.end).replace(">","").replace("<","")
+ − 230 start = str(feature.location.start).replace(">","").replace("<","")
+ − 231 strand = str(feature.location.strand)
+ − 232 if strand == "None":
+ − 233 strand = 0
+ − 234
+ − 235 genomeGraph.add((generalURI,coreURI["sourcedb"],Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
+ − 236
+ − 237 if strand == "-1":
+ − 238 genomeGraph.add((generalURI,coreURI["end"],Literal(int(start)+1)))
+ − 239 genomeGraph.add((generalURI,coreURI["begin"],Literal(int(end))))
+ − 240 else:
+ − 241 genomeGraph.add((generalURI,coreURI["begin"],Literal(int(start)+1)))
+ − 242 genomeGraph.add((generalURI,coreURI["end"],Literal(int(end))))
+ − 243 genomeGraph.add((generalURI,coreURI["strand"],Literal(int(strand))))
+ − 244 if feature.type != "misc_feature":
+ − 245 try:
+ − 246 genomeGraph.add((generalURI,coreURI["sequence"],Literal(cds_sequence)))
+ − 247 except: #When protein sequence is not given for whatever reason
+ − 248 print ("wrong?")
+ − 249
+ − 250 if feature.type == "misc_feature":
+ − 251 pass
+ − 252 else:
+ − 253 genomeGraph.add((generalURI,RDF.type,createClass(coreURI[feature_type.lower().title()], root=False)))
+ − 254 if feature_type.lower() != "rrna" and feature_type.lower() != "trna" and feature_type.lower() != "tmrna" and feature_type.lower() != "ncrna":
+ − 255 SubClassOfDict[feature_type.lower().title()] = 1
+ − 256 for key in feature.qualifiers:
+ − 257 values = feature.qualifiers[key]
+ − 258 if key == "translation":
+ − 259 pass
+ − 260 elif type(values) == list:
+ − 261 for v in values:
+ − 262 int_add(generalURI,coreURI[key.lower()],v)
+ − 263 else:
+ − 264 int_add(generalURI,coreURI[key.lower()],values)
+ − 265 if feature.type == "CDS":
+ − 266 try:
+ − 267 #Feature is normally submitted to this function
+ − 268 #IF a subfeature is submitted it is submitted as a feature
+ − 269 #And subfeature variable will contain the superfeature
+ − 270 if superfeature:
+ − 271 codon = superfeature.qualifiers["transl_table"][0]
+ − 272 except:
+ − 273 #Default codon table 11
+ − 274 codon = "11"
+ − 275 #Protein linkage
+ − 276 translation = ""
+ − 277 try:
+ − 278 translation = feature.qualifiers["translation"][0].strip("*")
+ − 279 except KeyError:
+ − 280 #When protein sequence is not given...
+ − 281 if len(feature.location.parts) > 1:
+ − 282 #Exon boundaries?
+ − 283 seq = ''
+ − 284 for loc in feature.location:
+ − 285 seq += record.seq[loc]
+ − 286 if int(feature.location.strand) == -1:
+ − 287 seq = Seq(seq).complement()
+ − 288 else:
+ − 289 seq = Seq(seq)
+ − 290 translation = str(seq.translate(feature.qualifiers["transl_table"][0]))
+ − 291 elif int(feature.location.strand) == -1:
+ − 292 if str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].reverse_complement().translate(codon)).strip("*") != translation:
+ − 293 if len(str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end])) % 3 == 0:
+ − 294 translation = str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].reverse_complement().translate(codon))
+ − 295 else:
+ − 296 translation = ''
+ − 297 elif int(feature.location.strand) == +1:
+ − 298 if len(str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end])) % 3 == 0:
+ − 299 translation = str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].translate(codon))
+ − 300 else:
+ − 301 translation = ''
+ − 302
+ − 303 if translation:
+ − 304 translation = list(translation)
+ − 305 translation[0] = "M"
+ − 306 translation = ''.join(translation).strip("*")
+ − 307 if "*" in translation:
+ − 308 pass
+ − 309
+ − 310 translation = translation.encode('utf-8')
+ − 311 md5_protein = hashlib.md5(translation).hexdigest()
+ − 312 proteinURI = coreURI["protein/"+md5_protein]
+ − 313 genomeGraph.add((generalURI,coreURI["protein"],proteinURI))
+ − 314 for key in feature.qualifiers:
+ − 315 for v in feature.qualifiers[key]:
+ − 316 if key == "translation":
+ − 317 genomeGraph.add((proteinURI,coreURI["md5"],Literal(md5_protein)))
+ − 318 genomeGraph.add((proteinURI,coreURI["sequence"],Literal(translation)))
+ − 319 genomeGraph.add((proteinURI,RDF.type,proteinClass))
+ − 320 else:
+ − 321 for v in feature.qualifiers[key]:
+ − 322 int_add(generalURI,coreURI[key.lower()],v)
+ − 323
+ − 324 def int_add(subject, predicate, obj):
+ − 325 try:
+ − 326 object_float = float(obj.replace('"',''))
+ − 327 object_int = int(obj.replace('"',''))
+ − 328 if object_int == object_float:
+ − 329 genomeGraph.add((subject,predicate,Literal(object_int)))
+ − 330 else:
+ − 331 genomeGraph.add((subject,predicate,Literal(object_float)))
+ − 332 except:
+ − 333 genomeGraph.add((subject,predicate,Literal(obj.replace('"',''))))
+ − 334
+ − 335 def save():
+ − 336 data = genomeGraph.serialize(format='turtle')
+ − 337 open(sys.argv[sys.argv.index("-output")+1],"wb").write(data)
+ − 338
+ − 339 def subClassOfBuilder():
+ − 340 for subclass in SubClassOfDict:
+ − 341 genomeGraph.add((coreURI["Feature"],RDFS.subClassOf,OWL.Thing))
+ − 342 genomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Feature"]))
+ − 343
+ − 344 def subClassOfBuilderRna():
+ − 345 for subclass in SubClassOfDictRna:
+ − 346 genomeGraph.add((coreURI["Feature"],RDFS.subClassOf,OWL.Thing))
+ − 347 genomeGraph.add((coreURI["Rna"],RDFS.subClassOf,coreURI["Feature"]))
+ − 348 genomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Rna"]))
+ − 349 genomeGraph.add((coreURI[subclass],RDF.type,OWL.Class))
+ − 350
+ − 351 def main():
+ − 352 tmp()
+ − 353 gbk_parser()
+ − 354 subClassOfBuilder()
+ − 355 subClassOfBuilderRna()
+ − 356 save()
+ − 357 cleantmp()
+ − 358
+ − 359 if __name__ == "__main__":
+ − 360 main()