Mercurial > repos > jjkoehorst > sapp
view gbk2rdf/gbktordf.py @ 7:c79025539d9b
FASTA to RDF
author | jjkoehorst <jasperkoehorst@gmail.com> |
---|---|
date | Sat, 21 Feb 2015 15:23:15 +0100 |
parents | ec73c34af97b |
children |
line wrap: on
line source
#!/usr/bin/env python3.4 # Author: Jasper Jan Koehorst # Date created: Feb 21 2015 # Function: generation of a RDF file from Genbank/EMBL import warnings warnings.filterwarnings("ignore") def delete_galaxy(): import sys for index, path in enumerate(sys.path): if "galaxy-dist/" in path: sys.path[index] = '' #Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function. This is not an elegant solution but it works for now. delete_galaxy() from Bio import SeqIO # Import RDFLib's default Graph implementation. import os, sys from Bio.Seq import Seq from rdflib import Graph, URIRef, Literal,Namespace,RDF,RDFS,OWL, plugin from rdflib.store import Store import hashlib store = plugin.get('IOMemory', Store)() global URI URI = "http://csb.wur.nl/genome/" global seeAlso seeAlso = "rdfs:seeAlso" global coreURI coreURI = Namespace(URI) global SubClassOfDict SubClassOfDict = {} global SubClassOfDictRna SubClassOfDictRna = {} def createClass(uri, root=True): genomeGraph.add((uri,RDF.type,OWL.Class)) if root: genomeGraph.add((uri,RDFS.subClassOf,OWL.Thing)) return uri def tmp(): import time global tmpFolder tmpFolder = "/tmp/"+str(time.time())+"/" os.mkdir(tmpFolder) def cleantmp(): os.system("ls "+tmpFolder) os.system("rm -rf "+tmpFolder) def crawler(): #From input folder it looks for GBK file (gz files are in progress) input_file = sys.argv[sys.argv.index("-input")+1] gbk_parser(input_file) def gbk_parser(): prevObjStart = -1 prevObjStop = -1 store = plugin.get('IOMemory', Store)() global genomeGraph genomeGraph = Graph(store,URIRef(URI)) genomeGraph.bind("ssb",coreURI) input_file = sys.argv[sys.argv.index("-input")+1] #CLASS definitions genomeClass = createClass(coreURI["Genome"], root=True) typeClass = createClass(coreURI["DnaObject"], root=True) createClass(coreURI["Protein"], root=True) pubmedClass = createClass(coreURI["Pubmed"], root=True) miscClass = createClass(coreURI["MiscFeature"], root=False) createClass(coreURI["Feature"], root=True) SubClassOfDict["MiscFeature"] = 1 SubClassOfDictRna["Trna"] = 1 SubClassOfDictRna["Rrna"] = 1 SubClassOfDictRna["Tmrna"] = 1 SubClassOfDictRna["Ncrna"] = 1 # codon = "11" #Default initialization if no CDS are present ################## weird_chars = list(''',./?<>:;"'|\}]{[+=_-)(*&^%$#@!±§~` ''') scaf_value = 0 #Which files are already done ######## formatGBK = sys.argv[sys.argv.index("-format")+1] for record in SeqIO.parse(input_file, formatGBK): #Read first feature for genome name and information... #Ignore the empty GBK file due to the lack of features? for index, feature in enumerate(record.features): if index == 0: if "-identifier" in sys.argv: genome = sys.argv[sys.argv.index("-identifier")+1] else: try: genome = feature.qualifiers["organism"][0].replace(" ","_") except: #BUG: THIS IS A TEMP FIX, USE GALAXY -IDENTIFIER TO CAPTURE THIS genome = "XNoneX" for char in weird_chars: genome = genome.replace(char,"_") try: gi = record.annotations["gi"] typ = str(gi) except: try: gi = record.annotations["accessions"][0] typ = str(gi) except: scaf_value += 1 typ = "scaffold_"+str(scaf_value) genomeURI = coreURI[genome] gbkURI = coreURI[genome + "/" + typ] #To contig connection to connect all data to it genomeGraph.add((genomeURI, coreURI["dnaobject"] , gbkURI)) #General genome features also stored in the class... if "genome" in feature.qualifiers: genomeGraph.add((genomeURI, coreURI["organism"],Literal(feature.qualifiers["organism"][0]))) if "strain" in feature.qualifiers: genomeGraph.add((genomeURI, coreURI["strain"],Literal(feature.qualifiers["strain"][0]))) if "taxonomy" in record.annotations: for taxon in record.annotations["taxonomy"]: genomeGraph.add((genomeURI, coreURI["taxonomy"],Literal(taxon))) record.annotations["taxonomy"] = [] #Genome sequence# sequence = str(record.seq) #Verify if sequence was not empty and is now full of X or N filtered_sequence = sequence.replace("X","").replace("N","") if len(filtered_sequence) == 0: sequence = "" #Record parsing# for annot in record.annotations: if type(record.annotations[annot]) == list: if annot == "references": for references in record.annotations[annot]: if references.pubmed_id != "": pubmed = references.pubmed_id genomeGraph.add((gbkURI, coreURI[annot.lower()] , coreURI["pubmed/"+pubmed])) obj_dict = references.__dict__ for key in obj_dict: genomeGraph.add((coreURI["pubmed/"+pubmed], coreURI[key.lower()], Literal(str(obj_dict[key])))) genomeGraph.add((coreURI["pubmed/"+pubmed], RDF.type, pubmedClass)) else: for a in record.annotations[annot]: int_add(gbkURI,coreURI[annot.lower()],str(a)) else: int_add(gbkURI,coreURI[annot.lower()],str(record.annotations[annot])) #####END of RECORD#### if len(sequence) > 0: genomeGraph.add((gbkURI, coreURI["sequence"] , Literal(sequence))) genomeGraph.add((genomeURI, RDF.type,genomeClass)) genomeGraph.add((gbkURI, RDF.type,typeClass)) for key in feature.qualifiers: genomeGraph.add((gbkURI, coreURI[key.lower()] , Literal(feature.qualifiers[key][0]))) #break else: #The rest of the GBK file feature_type = feature.type end = str(feature.location.end).replace(">","").replace("<","") start = str(feature.location.start).replace(">","").replace("<","") strand = str(feature.location.strand) if strand == 'None': strand = 0 else: if feature.type == "misc_feature": #Store as part of previous cds or something... if strand == "-1": miscURI = coreURI[genome + "/" + typ + "/"+feature_type+"/gbk/"+str(end)+"_"+str(start)] else: miscURI = coreURI[genome + "/" + typ + "/"+feature_type+"/gbk/"+str(start)+"_"+str(end)] # TODO: Check if biopython has an overlap function... if int(prevObjStart) <= int(start): if int(end) <= int(prevObjStop): pass # genomeGraph.add((typeURI,coreURI["feature"],miscURI)) # genomeGraph.add((miscURI,RDF.type,miscClass)) else: genomeGraph.add((gbkURI, coreURI["feature"] , miscURI)) genomeGraph.add((miscURI,RDF.type,miscClass)) else: genomeGraph.add((gbkURI, coreURI["feature"] , miscURI)) genomeGraph.add((miscURI,RDF.type,miscClass)) store_general_information(miscURI,feature,record) else: prevObjStart = start prevObjStop = end if strand == "-1": typeURI = coreURI[genome + "/" + typ + "/" + feature_type+"/gbk/"+str(end)+"_"+str(start)] else: typeURI = coreURI[genome + "/" + typ + "/" + feature_type+"/gbk/"+str(start)+"_"+str(end)] #Contig specific connection genomeGraph.add((gbkURI, coreURI["feature"] , typeURI)) ############################ store_general_information(typeURI,feature,record) for subfeature in feature.sub_features: strand = str(subfeature.location.strand) subfeature_type = subfeature.type end = str(subfeature.location.end).replace(">","").replace("<","") start = str(subfeature.location.start).replace(">","").replace("<","") if strand == "-1": subURI = coreURI[genome + "/" + typ + "/" + subfeature_type+"/gbk/"+str(end)+"_"+str(start)] else: subURI = coreURI[genome + "/" + typ + "/" + subfeature_type+"/gbk/"+str(start)+"_"+str(end)] genomeGraph.add((typeURI, coreURI["feature"] , subURI)) store_general_information(subURI,subfeature,record,feature) def store_general_information(generalURI,feature,record,superfeature=""): proteinClass = createClass(coreURI["Protein"], root=True) sequence = str(record.seq) cds_sequence = str(feature.extract(sequence)) #Fixes the 0 count instead of 1-count in biopython vs humans feature_type = feature.type end = str(feature.location.end).replace(">","").replace("<","") start = str(feature.location.start).replace(">","").replace("<","") strand = str(feature.location.strand) if strand == "None": strand = 0 genomeGraph.add((generalURI,coreURI["sourcedb"],Literal(sys.argv[sys.argv.index("-sourcedb")+1]))) if strand == "-1": genomeGraph.add((generalURI,coreURI["end"],Literal(int(start)+1))) genomeGraph.add((generalURI,coreURI["begin"],Literal(int(end)))) else: genomeGraph.add((generalURI,coreURI["begin"],Literal(int(start)+1))) genomeGraph.add((generalURI,coreURI["end"],Literal(int(end)))) genomeGraph.add((generalURI,coreURI["strand"],Literal(int(strand)))) if feature.type != "misc_feature": try: genomeGraph.add((generalURI,coreURI["sequence"],Literal(cds_sequence))) except: #When protein sequence is not given for whatever reason print ("wrong?") if feature.type == "misc_feature": pass else: genomeGraph.add((generalURI,RDF.type,createClass(coreURI[feature_type.lower().title()], root=False))) if feature_type.lower() != "rrna" and feature_type.lower() != "trna" and feature_type.lower() != "tmrna" and feature_type.lower() != "ncrna": SubClassOfDict[feature_type.lower().title()] = 1 for key in feature.qualifiers: values = feature.qualifiers[key] if key == "translation": pass elif type(values) == list: for v in values: int_add(generalURI,coreURI[key.lower()],v) else: int_add(generalURI,coreURI[key.lower()],values) if feature.type == "CDS": try: #Feature is normally submitted to this function #IF a subfeature is submitted it is submitted as a feature #And subfeature variable will contain the superfeature if superfeature: codon = superfeature.qualifiers["transl_table"][0] except: #Default codon table 11 codon = "11" #Protein linkage translation = "" try: translation = feature.qualifiers["translation"][0].strip("*") except KeyError: #When protein sequence is not given... if len(feature.location.parts) > 1: #Exon boundaries? seq = '' for loc in feature.location: seq += record.seq[loc] if int(feature.location.strand) == -1: seq = Seq(seq).complement() else: seq = Seq(seq) translation = str(seq.translate(feature.qualifiers["transl_table"][0])) elif int(feature.location.strand) == -1: if str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].reverse_complement().translate(codon)).strip("*") != translation: if len(str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end])) % 3 == 0: translation = str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].reverse_complement().translate(codon)) else: translation = '' elif int(feature.location.strand) == +1: if len(str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end])) % 3 == 0: translation = str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].translate(codon)) else: translation = '' if translation: translation = list(translation) translation[0] = "M" translation = ''.join(translation).strip("*") if "*" in translation: pass translation = translation.encode('utf-8') md5_protein = hashlib.md5(translation).hexdigest() proteinURI = coreURI["protein/"+md5_protein] genomeGraph.add((generalURI,coreURI["protein"],proteinURI)) for key in feature.qualifiers: for v in feature.qualifiers[key]: if key == "translation": genomeGraph.add((proteinURI,coreURI["md5"],Literal(md5_protein))) genomeGraph.add((proteinURI,coreURI["sequence"],Literal(translation))) genomeGraph.add((proteinURI,RDF.type,proteinClass)) else: for v in feature.qualifiers[key]: int_add(generalURI,coreURI[key.lower()],v) def int_add(subject, predicate, obj): try: object_float = float(obj.replace('"','')) object_int = int(obj.replace('"','')) if object_int == object_float: genomeGraph.add((subject,predicate,Literal(object_int))) else: genomeGraph.add((subject,predicate,Literal(object_float))) except: genomeGraph.add((subject,predicate,Literal(obj.replace('"','')))) def save(): data = genomeGraph.serialize(format='turtle') open(sys.argv[sys.argv.index("-output")+1],"wb").write(data) def subClassOfBuilder(): for subclass in SubClassOfDict: genomeGraph.add((coreURI["Feature"],RDFS.subClassOf,OWL.Thing)) genomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Feature"])) def subClassOfBuilderRna(): for subclass in SubClassOfDictRna: genomeGraph.add((coreURI["Feature"],RDFS.subClassOf,OWL.Thing)) genomeGraph.add((coreURI["Rna"],RDFS.subClassOf,coreURI["Feature"])) genomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Rna"])) genomeGraph.add((coreURI[subclass],RDF.type,OWL.Class)) def main(): tmp() gbk_parser() subClassOfBuilder() subClassOfBuilderRna() save() cleantmp() if __name__ == "__main__": main()