Mercurial > repos > jjkoehorst > sapp
view protein2rdf/protein_to_ttl.py @ 14:41bfc53ee550
removal of rnaseq
author | jjkoehorst <jasperkoehorst@gmail.com> |
---|---|
date | Sat, 21 Feb 2015 17:12:28 +0100 |
parents | 0773b11fb822 |
children |
line wrap: on
line source
def delete_galaxy(): import sys for index, path in enumerate(sys.path): if "galaxy-dist/" in path: sys.path[index] = '' #Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function. delete_galaxy() # from io import StringIO from rdflib import Graph, URIRef, Literal,Namespace, RDF,RDFS,OWL, plugin # import rdflib from rdflib.store import Store import sys import hashlib store = plugin.get('IOMemory', Store)() global URI URI = "http://csb.wur.nl/genome/" global seeAlso seeAlso = "rdfs:seeAlso" global coreURI coreURI = Namespace(URI) def createClass(uri): genomeGraph.add((uri,RDF.type,OWL.Class)) genomeGraph.add((uri,RDFS.subClassOf,OWL.Thing)) return uri def fasta_parser(input_file): createClass(coreURI["Protein"]) genome = sys.argv[sys.argv.index('-idtag')+1].replace(" ","_") if genome == '': genome = sys.argv[sys.argv.index('-id_alternative')+1].replace(" ","_").replace(".","_") genomeURI = coreURI[genome] for index, element in enumerate(sys.argv): if '-organism' == element: genomeGraph.add((genomeURI, coreURI["organism"] , Literal(sys.argv[index+1]))) if '-ncbi_taxid' == element: genomeGraph.add((genomeURI, coreURI["taxonomy"] , Literal(sys.argv[index+1]))) if '-idtag' == element: genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1]))) if '-diagnosis' == element: genomeGraph.add((genomeURI, coreURI["diagnosis"] , Literal(sys.argv[index+1]))) if '-country' == element: genomeGraph.add((genomeURI, coreURI["country"] , Literal(sys.argv[index+1]))) if '-location' == element: genomeGraph.add((genomeURI, coreURI["location"] , Literal(sys.argv[index+1]))) if '-date' == element: genomeGraph.add((genomeURI, coreURI["date"] , Literal(sys.argv[index+1]))) if '-ids' == element: genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1]))) data = (open(input_file).readlines()) fastadict = {} sequence = "" key = "" for index, line in enumerate(data): if ">" == line[0]: if sequence: fastadict[key] = sequence key = line sequence = "" fastadict[key] = "" else: sequence += line.strip() fastadict[key] = sequence #Create a class, to be the same as all the other genome conversions... #TODO: Proteins are part of cds, cds are part of dnaobject #If CDS is not there... how then? classURI = coreURI[genome + "/" + "protein_fasta"] proteinClass = createClass(coreURI["Protein"]) genomeClass = createClass(coreURI["Genome"]) typeClass = createClass(coreURI["DnaObject"]) cdsClass = createClass(coreURI["Cds"]) #A theoretical begin, end is created to have a workable GBK generation begin = 0 end = 0 genomeGraph.add((genomeURI, RDF.type, genomeClass)) genomeGraph.add((genomeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1]))) genomeGraph.add((genomeURI, coreURI["dnaobject"] , classURI)) genomeGraph.add((classURI, RDF.type, typeClass)) for protein in fastadict: sequence = fastadict[protein] sequence = sequence.encode('utf-8') end = begin + len(sequence) md5_protein = hashlib.md5(sequence).hexdigest() proteinURI = coreURI["protein/"+md5_protein] cdsURI = coreURI[genome + "/protein_fasta/" + str(begin)+"_"+str(end)] genomeGraph.add((classURI, coreURI["feature"] , cdsURI)) genomeGraph.add((cdsURI, coreURI["begin"] , Literal(begin))) genomeGraph.add((cdsURI, coreURI["end"] , Literal(end))) genomeGraph.add((cdsURI, coreURI["sourcedb"] , Literal(sys.argv[sys.argv.index("-sourcedb")+1]))) genomeGraph.add((cdsURI, coreURI["protein"] , proteinURI)) genomeGraph.add((cdsURI, RDF.type, cdsClass)) genomeGraph.add((proteinURI,coreURI["md5"],Literal(md5_protein))) genomeGraph.add((proteinURI,coreURI["sequence"],Literal(sequence))) genomeGraph.add((proteinURI,RDF.type,proteinClass)) genomeGraph.add((proteinURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1]))) genomeGraph.add((proteinURI, RDF.type, proteinClass)) begin = end def save(): data = genomeGraph.serialize(format='turtle') open(sys.argv[sys.argv.index("-output")+1],"wb").write(data) def main(): store = plugin.get('IOMemory', Store)() global genomeGraph genomeGraph = Graph(store,URIRef(URI)) genomeGraph.bind("ssb",coreURI) input_file = sys.argv[sys.argv.index("-input")+1] fasta_parser(input_file) save() if __name__ == '__main__': main()