Mercurial > repos > jjkoehorst > sapp
diff gbk2rdf/gbktordf.py @ 3:db04e12b8779
Uploaded
author | jjkoehorst |
---|---|
date | Sat, 21 Feb 2015 07:28:39 -0500 |
parents | |
children | ec73c34af97b |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gbk2rdf/gbktordf.py Sat Feb 21 07:28:39 2015 -0500 @@ -0,0 +1,371 @@ +#!/usr/bin/env python3.4 +# Author: Jasper Jan Koehorst +# Date created: Feb 21 2015 +# Function: generation of a RDF file from Genbank/EMBL + +import warnings +warnings.filterwarnings("ignore") + +def delete_galaxy(): + import sys + for index, path in enumerate(sys.path): + if "galaxy-dist/" in path: + sys.path[index] = '' + +#Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function. This is not an elegant solution but it works for now. +delete_galaxy() + +from Bio import SeqIO +# Import RDFLib's default Graph implementation. +import os, sys +from Bio.Seq import Seq + +from rdflib import Graph, URIRef, Literal,Namespace,RDF,RDFS,OWL, plugin +from rdflib.store import Store +import hashlib +store = plugin.get('IOMemory', Store)() + +global URI +URI = "http://csb.wur.nl/genome/" +global seeAlso +seeAlso = "rdfs:seeAlso" +global coreURI +coreURI = Namespace(URI) + +global SubClassOfDict +SubClassOfDict = {} +global SubClassOfDictRna +SubClassOfDictRna = {} + +def createClass(uri, root=True): + genomeGraph.add((uri,RDF.type,OWL.Class)) + if root: + genomeGraph.add((uri,RDFS.subClassOf,OWL.Thing)) + return uri + +def tmp(): + import time + global tmpFolder + tmpFolder = "/tmp/"+str(time.time())+"/" + os.mkdir(tmpFolder) + +def cleantmp(): + os.system("ls "+tmpFolder) + os.system("rm -rf "+tmpFolder) + +def crawler(): + #From input folder it looks for GBK file (gz files are in progress) + input_file = sys.argv[sys.argv.index("-input")+1] + gbk_parser(input_file) + +def gbk_parser(): + prevObjStart = -1 + prevObjStop = -1 + store = plugin.get('IOMemory', Store)() + global genomeGraph + genomeGraph = Graph(store,URIRef(URI)) + genomeGraph.bind("ssb",coreURI) + input_file = sys.argv[sys.argv.index("-input")+1] + + #CLASS definitions + genomeClass = createClass(coreURI["Genome"], root=True) + typeClass = createClass(coreURI["DnaObject"], root=True) + createClass(coreURI["Protein"], root=True) + pubmedClass = createClass(coreURI["Pubmed"], root=True) + miscClass = createClass(coreURI["MiscFeature"], root=False) + createClass(coreURI["Feature"], root=True) + SubClassOfDict["MiscFeature"] = 1 + SubClassOfDictRna["Trna"] = 1 + SubClassOfDictRna["Rrna"] = 1 + SubClassOfDictRna["Tmrna"] = 1 + SubClassOfDictRna["Ncrna"] = 1 + +# codon = "11" #Default initialization if no CDS are present + ################## + weird_chars = list(''',./?<>:;"'|\}]{[+=_-)(*&^%$#@!±§~` ''') + scaf_value = 0 + #Which files are already done + ######## + formatGBK = sys.argv[sys.argv.index("-format")+1] + for record in SeqIO.parse(input_file, formatGBK): + #Read first feature for genome name and information... + #Ignore the empty GBK file due to the lack of features? + + for index, feature in enumerate(record.features): + if index == 0: + if "-identifier" in sys.argv: + genome = sys.argv[sys.argv.index("-identifier")+1] + else: + try: + genome = feature.qualifiers["organism"][0].replace(" ","_") + except: + #BUG: THIS IS A TEMP FIX, USE GALAXY -IDENTIFIER TO CAPTURE THIS + genome = "XNoneX" + for char in weird_chars: + genome = genome.replace(char,"_") + + try: + gi = record.annotations["gi"] + typ = str(gi) + except: + scaf_value += 1 + typ = "scaffold_"+str(scaf_value) + genomeURI = coreURI[genome] + gbkURI = coreURI[genome + "/" + typ] + #To contig connection to connect all data to it + genomeGraph.add((genomeURI, coreURI["dnaobject"] , gbkURI)) + + #General genome features also stored in the class... + if "genome" in feature.qualifiers: + genomeGraph.add((genomeURI, coreURI["organism"],Literal(feature.qualifiers["organism"][0]))) + if "strain" in feature.qualifiers: + genomeGraph.add((genomeURI, coreURI["strain"],Literal(feature.qualifiers["strain"][0]))) + if "taxonomy" in record.annotations: + for taxon in record.annotations["taxonomy"]: + genomeGraph.add((genomeURI, coreURI["taxonomy"],Literal(taxon))) + record.annotations["taxonomy"] = [] + #Genome sequence# + sequence = str(record.seq) + #Verify if sequence was not empty and is now full of X or N + filtered_sequence = sequence.replace("X","").replace("N","") + if len(filtered_sequence) == 0: + sequence = "" + #Record parsing# + for annot in record.annotations: + if type(record.annotations[annot]) == list: + if annot == "references": + for references in record.annotations[annot]: + if references.pubmed_id != "": + pubmed = references.pubmed_id + genomeGraph.add((gbkURI, coreURI[annot.lower()] , coreURI["pubmed/"+pubmed])) + obj_dict = references.__dict__ + for key in obj_dict: + genomeGraph.add((coreURI["pubmed/"+pubmed], coreURI[key.lower()], Literal(str(obj_dict[key])))) + genomeGraph.add((coreURI["pubmed/"+pubmed], RDF.type, pubmedClass)) + + else: + for a in record.annotations[annot]: + int_add(gbkURI,coreURI[annot.lower()],str(a)) + else: + int_add(gbkURI,coreURI[annot.lower()],str(record.annotations[annot])) + + + #####END of RECORD#### + if len(sequence) > 0: + genomeGraph.add((gbkURI, coreURI["sequence"] , Literal(sequence))) + genomeGraph.add((genomeURI, RDF.type,genomeClass)) + genomeGraph.add((gbkURI, RDF.type,typeClass)) + for key in feature.qualifiers: + genomeGraph.add((gbkURI, coreURI[key.lower()] , Literal(feature.qualifiers[key][0]))) + #break + else: #The rest of the GBK file + feature_type = feature.type + end = str(feature.location.end).replace(">","").replace("<","") + start = str(feature.location.start).replace(">","").replace("<","") + + strand = str(feature.location.strand) + + if strand == 'None': + strand = 0 + +# if feature_type == "gene": +# gene = feature + #Store gene in next feature.... +# gene_location_start = end = str(gene.location.end).replace(">","").replace("<","") +# gene_location_stop = str(gene.location.start).replace(">","").replace("<","") +# gene_qualifiers = gene.qualifiers + else: + if feature.type == "misc_feature": #Store as part of previous cds or something... + if strand == "-1": + miscURI = coreURI[genome + "/" + typ + "/"+feature_type+"/gbk/"+str(end)+"_"+str(start)] + else: + miscURI = coreURI[genome + "/" + typ + "/"+feature_type+"/gbk/"+str(start)+"_"+str(end)] + + # genomeGraph.add((generalURI,coreURI["subFeature"],miscURI)) + + # TODO: Check if biopython has an overlap function... + if int(prevObjStart) <= int(start): + if int(end) <= int(prevObjStop): + pass +# genomeGraph.add((typeURI,coreURI["feature"],miscURI)) +# genomeGraph.add((miscURI,RDF.type,miscClass)) + else: + genomeGraph.add((gbkURI, coreURI["feature"] , miscURI)) + genomeGraph.add((miscURI,RDF.type,miscClass)) + else: + genomeGraph.add((gbkURI, coreURI["feature"] , miscURI)) + genomeGraph.add((miscURI,RDF.type,miscClass)) + + store_general_information(miscURI,feature,record) + else: + prevObjStart = start + prevObjStop = end + + + if strand == "-1": + typeURI = coreURI[genome + "/" + typ + "/" + feature_type+"/gbk/"+str(end)+"_"+str(start)] + else: + typeURI = coreURI[genome + "/" + typ + "/" + feature_type+"/gbk/"+str(start)+"_"+str(end)] + +# cds_sequence = str(feature.extract(sequence)) + #Contig specific connection + + genomeGraph.add((gbkURI, coreURI["feature"] , typeURI)) + ############################ + + store_general_information(typeURI,feature,record) + + for subfeature in feature.sub_features: + strand = str(subfeature.location.strand) + subfeature_type = subfeature.type + end = str(subfeature.location.end).replace(">","").replace("<","") + start = str(subfeature.location.start).replace(">","").replace("<","") + + if strand == "-1": + subURI = coreURI[genome + "/" + typ + "/" + subfeature_type+"/gbk/"+str(end)+"_"+str(start)] + else: + subURI = coreURI[genome + "/" + typ + "/" + subfeature_type+"/gbk/"+str(start)+"_"+str(end)] + genomeGraph.add((typeURI, coreURI["feature"] , subURI)) + store_general_information(subURI,subfeature,record,feature) + +def store_general_information(generalURI,feature,record,superfeature=""): + proteinClass = createClass(coreURI["Protein"], root=True) + sequence = str(record.seq) + cds_sequence = str(feature.extract(sequence)) + #Fixes the 0 count instead of 1-count in biopython vs humans + feature_type = feature.type + end = str(feature.location.end).replace(">","").replace("<","") + start = str(feature.location.start).replace(">","").replace("<","") + strand = str(feature.location.strand) + if strand == "None": + strand = 0 + + genomeGraph.add((generalURI,coreURI["sourcedb"],Literal(sys.argv[sys.argv.index("-sourcedb")+1]))) + + if strand == "-1": + genomeGraph.add((generalURI,coreURI["end"],Literal(int(start)+1))) + genomeGraph.add((generalURI,coreURI["begin"],Literal(int(end)))) + else: + genomeGraph.add((generalURI,coreURI["begin"],Literal(int(start)+1))) + genomeGraph.add((generalURI,coreURI["end"],Literal(int(end)))) + genomeGraph.add((generalURI,coreURI["strand"],Literal(int(strand)))) + if feature.type != "misc_feature": + try: + genomeGraph.add((generalURI,coreURI["sequence"],Literal(cds_sequence))) + except: #When protein sequence is not given for whatever reason + print ("wrong?") + + if feature.type == "misc_feature": + pass + else: + genomeGraph.add((generalURI,RDF.type,createClass(coreURI[feature_type.lower().title()], root=False))) + if feature_type.lower() != "rrna" and feature_type.lower() != "trna" and feature_type.lower() != "tmrna" and feature_type.lower() != "ncrna": + SubClassOfDict[feature_type.lower().title()] = 1 + for key in feature.qualifiers: + values = feature.qualifiers[key] + if key == "translation": + pass + elif type(values) == list: + for v in values: + int_add(generalURI,coreURI[key.lower()],v) + else: + int_add(generalURI,coreURI[key.lower()],values) + if feature.type == "CDS": + try: + #Feature is normally submitted to this function + #IF a subfeature is submitted it is submitted as a feature + #And subfeature variable will contain the superfeature + if superfeature: + codon = superfeature.qualifiers["transl_table"][0] +# else: +# codon = subfeature.qualifiers["transl_table"][0] + except: + #Default codon table 11 + codon = "11" + #Protein linkage + translation = "" + try: + translation = feature.qualifiers["translation"][0].strip("*") + except KeyError: + #When protein sequence is not given... + if len(feature.location.parts) > 1: + #Exon boundaries? + seq = '' + for loc in feature.location: + seq += record.seq[loc] + if int(feature.location.strand) == -1: + seq = Seq(seq).complement() + else: + seq = Seq(seq) + translation = str(seq.translate(feature.qualifiers["transl_table"][0])) + elif int(feature.location.strand) == -1: + if str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].reverse_complement().translate(codon)).strip("*") != translation: + if len(str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end])) % 3 == 0: + translation = str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].reverse_complement().translate(codon)) + else: + translation = '' + elif int(feature.location.strand) == +1: + if len(str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end])) % 3 == 0: + translation = str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].translate(codon)) + else: + translation = '' + + if translation: + translation = list(translation) + translation[0] = "M" + translation = ''.join(translation).strip("*") + if "*" in translation: + pass + + translation = translation.encode('utf-8') + md5_protein = hashlib.md5(translation).hexdigest() + proteinURI = coreURI["protein/"+md5_protein] + genomeGraph.add((generalURI,coreURI["protein"],proteinURI)) + for key in feature.qualifiers: + for v in feature.qualifiers[key]: + if key == "translation": + genomeGraph.add((proteinURI,coreURI["md5"],Literal(md5_protein))) + genomeGraph.add((proteinURI,coreURI["sequence"],Literal(translation))) + genomeGraph.add((proteinURI,RDF.type,proteinClass)) + else: + for v in feature.qualifiers[key]: + int_add(generalURI,coreURI[key.lower()],v) + +def int_add(subject, predicate, obj): + try: + object_float = float(obj.replace('"','')) + object_int = int(obj.replace('"','')) + if object_int == object_float: + genomeGraph.add((subject,predicate,Literal(object_int))) + else: + genomeGraph.add((subject,predicate,Literal(object_float))) + except: + genomeGraph.add((subject,predicate,Literal(obj.replace('"','')))) + +def save(): + data = genomeGraph.serialize(format='turtle') + open(sys.argv[sys.argv.index("-output")+1],"wb").write(data) + +def subClassOfBuilder(): + for subclass in SubClassOfDict: + genomeGraph.add((coreURI["Feature"],RDFS.subClassOf,OWL.Thing)) + genomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Feature"])) + +def subClassOfBuilderRna(): + for subclass in SubClassOfDictRna: + genomeGraph.add((coreURI["Feature"],RDFS.subClassOf,OWL.Thing)) + genomeGraph.add((coreURI["Rna"],RDFS.subClassOf,coreURI["Feature"])) + genomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Rna"])) + genomeGraph.add((coreURI[subclass],RDFS.subClassOf,coreURI["Rna"])) + genomeGraph.add((coreURI[subclass],RDF.type,OWL.Class)) + +def main(): + tmp() + gbk_parser() + subClassOfBuilder() + subClassOfBuilderRna() + save() + cleantmp() + +if __name__ == "__main__": + main() \ No newline at end of file