view conversion/protein2rdf/protein_to_ttl.py @ 24:9610ddbca991

.
author jjkoehorst <jasperkoehorst@gmail.com>
date Sat, 21 Feb 2015 19:25:28 +0100
parents 74b8ba5e2d5b
children
line wrap: on
line source

def delete_galaxy():
	import sys
	for index, path in enumerate(sys.path):
		if "galaxy-dist/" in path:
			sys.path[index] = ''

#Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function.
delete_galaxy()

# from io import StringIO
from rdflib import Graph, URIRef, Literal,Namespace,  RDF,RDFS,OWL,  plugin
# import rdflib
from rdflib.store import Store
import sys
import hashlib

store = plugin.get('IOMemory', Store)()

global URI
URI = "http://csb.wur.nl/genome/"
global seeAlso
seeAlso = "rdfs:seeAlso"
global coreURI
coreURI = Namespace(URI)


def createClass(uri):
	genomeGraph.add((uri,RDF.type,OWL.Class))
	genomeGraph.add((uri,RDFS.subClassOf,OWL.Thing))
	return uri

def fasta_parser(input_file):
	createClass(coreURI["Protein"])
	
	genome = sys.argv[sys.argv.index('-idtag')+1].replace(" ","_")
	if genome == '':
		genome = sys.argv[sys.argv.index('-id_alternative')+1].replace(" ","_").replace(".","_")

	genomeURI = coreURI[genome]
	for index, element in enumerate(sys.argv):
		if '-organism' == element:
			genomeGraph.add((genomeURI, coreURI["organism"] , Literal(sys.argv[index+1])))
		if '-ncbi_taxid' == element:
			genomeGraph.add((genomeURI, coreURI["taxonomy"] , Literal(sys.argv[index+1])))
		if '-idtag' == element:
			genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1])))
		if '-diagnosis' == element:
			genomeGraph.add((genomeURI, coreURI["diagnosis"] , Literal(sys.argv[index+1])))
		if '-country' == element:
			genomeGraph.add((genomeURI, coreURI["country"] , Literal(sys.argv[index+1])))
		if '-location' == element:
			genomeGraph.add((genomeURI, coreURI["location"] , Literal(sys.argv[index+1])))
		if '-date' == element:
			genomeGraph.add((genomeURI, coreURI["date"] , Literal(sys.argv[index+1])))
		if '-ids' == element:
			genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1])))


	
	data = (open(input_file).readlines())
	fastadict = {}
	sequence = ""
	key = ""
	for index, line in enumerate(data):
		if ">" == line[0]:
			if sequence:
				fastadict[key] = sequence
			key = line
			sequence = ""
			fastadict[key] = ""
		else:
			sequence += line.strip()
	fastadict[key] = sequence
	
	#Create a class, to be the same as all the other genome conversions...
	#TODO: Proteins are part of cds, cds are part of dnaobject
	#If CDS is not there... how then?
	classURI = coreURI[genome + "/" + "protein_fasta"]
	proteinClass = createClass(coreURI["Protein"])
	genomeClass = createClass(coreURI["Genome"])
	typeClass = createClass(coreURI["DnaObject"])
	cdsClass = createClass(coreURI["Cds"])
	#A theoretical begin, end is created to have a workable GBK generation	
	begin = 0
	end = 0
	genomeGraph.add((genomeURI, RDF.type, genomeClass))
	genomeGraph.add((genomeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
	genomeGraph.add((genomeURI, coreURI["dnaobject"] , classURI))
	genomeGraph.add((classURI, RDF.type, typeClass))

	for protein in fastadict:
		sequence = fastadict[protein]
		sequence = sequence.encode('utf-8')
		end = begin + len(sequence)
		md5_protein = hashlib.md5(sequence).hexdigest()
		proteinURI = coreURI["protein/"+md5_protein]
		
		cdsURI = coreURI[genome + "/protein_fasta/" + str(begin)+"_"+str(end)]
		genomeGraph.add((classURI, coreURI["feature"] , cdsURI))	
		genomeGraph.add((cdsURI, coreURI["begin"] , Literal(begin)))	
		genomeGraph.add((cdsURI, coreURI["end"] , Literal(end)))
		genomeGraph.add((cdsURI, coreURI["sourcedb"] , Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
		genomeGraph.add((cdsURI, coreURI["protein"] , proteinURI))
		genomeGraph.add((cdsURI, RDF.type, cdsClass))
		
		

		genomeGraph.add((proteinURI,coreURI["md5"],Literal(md5_protein)))
		genomeGraph.add((proteinURI,coreURI["sequence"],Literal(sequence)))
		genomeGraph.add((proteinURI,RDF.type,proteinClass))
		genomeGraph.add((proteinURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
		genomeGraph.add((proteinURI, RDF.type, proteinClass))
		begin = end

def save():
	data = genomeGraph.serialize(format='turtle')
	open(sys.argv[sys.argv.index("-output")+1],"wb").write(data)

def main():
	store = plugin.get('IOMemory', Store)()
	global genomeGraph
	genomeGraph = Graph(store,URIRef(URI))
	genomeGraph.bind("ssb",coreURI)
	input_file = sys.argv[sys.argv.index("-input")+1]
	fasta_parser(input_file)
	save()

if __name__ == '__main__':
	main()