view fasta2rdf/fastatordf.py @ 9:3f4f1cd22a6a

FASTA 2 RDF code cleanup
author jjkoehorst <jasperkoehorst@gmail.com>
date Sat, 21 Feb 2015 15:38:26 +0100
parents ec73c34af97b
children
line wrap: on
line source

#!/usr/bin/env python3.4
# Author: Jasper Jan Koehorst
# Date created: Jan 22 2015
# Function: generation of an RDF file from a genome FASTA file


# from io import StringIO
from rdflib import Graph, URIRef, Literal,Namespace, RDF,RDFS,OWL, plugin
# import rdflib
from rdflib.store import Store
import sys

# Module-wide state shared by every function in this script.
# (Names assigned at module level are already global, so no `global`
# statements are needed here.)

# Base URI under which all genome resources and vocabulary terms are built.
URI = "http://csb.wur.nl/genome/"
# Not referenced in the visible part of this file; kept so that any code
# relying on the name keeps working.
seeAlso = "rdfs:seeAlso"
# Namespace over URI; indexed as coreURI["name"] to build the terms used below.
coreURI = Namespace(URI)

# One in-memory store backs the single graph that the whole script fills
# and finally serializes.
store = plugin.get('IOMemory', Store)()
genomeGraph = Graph(store,URIRef(URI))
genomeGraph.bind("ssb",coreURI)

def delete_galaxy():
	"""Blank out every ``sys.path`` entry that contains ``galaxy-dist/``.

	Matching entries are overwritten with the empty string; the list object,
	its length and the order of the remaining entries are left unchanged.
	"""
	# Slice assignment mutates the existing sys.path list rather than
	# rebinding the name, so other holders of the list see the change too.
	sys.path[:] = ['' if "galaxy-dist/" in entry else entry for entry in sys.path]

def createClass(uri):
	"""Declare *uri* as an OWL class in the module-wide genome graph.

	Two triples are added: ``uri rdf:type owl:Class`` and
	``uri rdfs:subClassOf owl:Thing``.

	Returns *uri* unchanged, so the call can be used wherever the class URI
	itself is needed.
	"""
	declarations = (
		(RDF.type, OWL.Class),
		(RDFS.subClassOf, OWL.Thing),
	)
	for predicate, target in declarations:
		genomeGraph.add((uri, predicate, target))
	return uri

def _read_fasta(input_file):
	"""Read the FASTA file *input_file* into a dict of header -> sequence.

	A line starting with ``>`` opens a new record; its key is the line with
	the ``>`` characters and surrounding whitespace stripped.  Every other
	non-blank line is stripped and appended to the current record.  A header
	that occurs twice restarts that record's sequence.

	Raises ValueError when sequence data appears before the first header.
	"""
	records = {}
	parts = None  # chunks of the record being read; None until a header is seen
	with open(input_file) as handle:
		for line in handle:
			if line.startswith(">"):
				parts = []
				records[line.strip(">").strip()] = parts
				continue
			chunk = line.strip()
			if not chunk:
				# Blank lines carry no sequence data.
				continue
			if parts is None:
				raise ValueError(
					"{0}: sequence data found before the first '>' header".format(input_file))
			parts.append(chunk)
	# Join once per record instead of growing a string line by line.
	return {header: "".join(chunks) for header, chunks in records.items()}

def fasta_parser(input_file):
	"""Fill the module-wide genome graph from a FASTA file and ``sys.argv``.

	The genome identifier is the value following ``-idtag`` (spaces replaced
	by underscores); when that value is the literal string ``None`` the value
	following ``-id_alternative`` is used instead (spaces and dots replaced by
	underscores).  The values of ``-organism``, ``-ncbi_taxid``, ``-idtag``
	and ``-ids`` are attached to the genome as literals.  Every FASTA record
	becomes a ``<genomeID>/dnaobject_<n>`` resource carrying its sequence,
	header and source database (the value following ``-sourcedb``).

	input_file: path of the FASTA file to read.

	Raises ValueError when a required flag is missing from ``sys.argv`` or
	when the FASTA file has sequence data before its first header, and
	IndexError when a flag is the last command-line argument.
	"""
	genomeClass = createClass(coreURI["Genome"])            #Genome class
	createClass(coreURI["Type"])                #Type class (Chr,Pls,Scaffold)

	genomeID = sys.argv[sys.argv.index('-idtag')+1].replace(" ","_")
	if genomeID == 'None':
		genomeID = sys.argv[sys.argv.index('-id_alternative')+1].replace(" ","_").replace(".","_")
	genomeURI = coreURI[genomeID]

	# Command-line flag -> predicate name under coreURI.
	# NOTE(review): '-ids' is stored under "id_tag", the same predicate as
	# '-idtag'; this mirrors the existing behaviour - confirm it is intended.
	argument_predicates = {
		'-organism': "organism",
		'-ncbi_taxid': "taxonomy",
		'-idtag': "id_tag",
		'-ids': "id_tag",
	}
	for index, element in enumerate(sys.argv):
		predicate = argument_predicates.get(element)
		if predicate is not None:
			genomeGraph.add((genomeURI, coreURI[predicate], Literal(sys.argv[index+1])))

	fastadict = _read_fasta(input_file)
	typeClass = createClass(coreURI["DnaObject"])
	if not fastadict:
		# No records: nothing is attached to the genome and '-sourcedb' is
		# not required.
		return

	# These do not vary per record, so they are looked up and added once.
	sourcedb = Literal(sys.argv[sys.argv.index("-sourcedb")+1])
	genomeGraph.add((genomeURI, RDF.type,genomeClass))
	genomeGraph.add((genomeURI, coreURI["sourcedb"], sourcedb))

	for index, (header, sequence) in enumerate(fastadict.items()):
		typeURI = coreURI[genomeID + "/dnaobject_" + str(index)]
		genomeGraph.add((genomeURI, coreURI["dnaobject"] , typeURI))
		genomeGraph.add((typeURI, coreURI["sequence"] ,  Literal(sequence)))
		genomeGraph.add((typeURI, coreURI["header"], Literal(header)))
		genomeGraph.add((typeURI, coreURI["sourcedb"], sourcedb))
		genomeGraph.add((typeURI, RDF.type,typeClass))

def save():
	"""Write the genome graph, serialized as Turtle, to the ``-output`` path.

	The destination is the command-line argument following ``-output``.

	Raises ValueError when ``-output`` is missing from ``sys.argv`` and
	IndexError when it is the last argument.
	"""
	data = genomeGraph.serialize(format='turtle')
	# Depending on the rdflib release, serialize() returns either bytes or
	# text; the file is opened in binary mode, so normalise to bytes first.
	if not isinstance(data, bytes):
		data = data.encode("utf-8")
	output_path = sys.argv[sys.argv.index("-output")+1]
	# The context manager guarantees the file is flushed and closed.
	with open(output_path,"wb") as handle:
		handle.write(data)

def main():
	"""Convert the FASTA file named after ``-input`` and save the graph."""
	argument_position = sys.argv.index("-input") + 1
	fasta_parser(sys.argv[argument_position])
	save()

if __name__ == "__main__":
	# Galaxy bundles some modules that rdflib also requires, which interferes
	# with rdflib's importing; blank the Galaxy entries in sys.path before
	# running the conversion.
	delete_galaxy()
	main()