diff conversion/fasta2rdf/fastatordf.py @ 29:dd59731d50b5

Biopython and python3.4 inclusion test.
author Jasper Koehorst <jasperkoehorst@gmail.com>
date Wed, 25 Feb 2015 08:25:32 +0100
parents 74b8ba5e2d5b
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/conversion/fasta2rdf/fastatordf.py	Wed Feb 25 08:25:32 2015 +0100
@@ -0,0 +1,97 @@
+#!/usr/bin/env python3.4
+# Author: Jasper Jan Koehorst
+# Date created: Jan 22 2015
+# Function: generation of an RDF file from a genome FASTA file
+
+
+# from io import StringIO
+from rdflib import Graph, URIRef, Literal,Namespace, RDF,RDFS,OWL, plugin
+# import rdflib
+from rdflib.store import Store
+import sys
+
+# Module-wide state shared by every function in this script.
+# (The former module-level "global" statements were no-ops and the store was
+# instantiated twice; both redundancies are removed, names are unchanged.)

+# Base URI under which the genome resources and vocabulary terms are minted.
+URI = "http://csb.wur.nl/genome/"
+# NOTE(review): not referenced elsewhere in this script; kept in case other code reads it.
+seeAlso = "rdfs:seeAlso"
+# Namespace built on URI; coreURI["name"] is how the functions below mint terms.
+coreURI = Namespace(URI)
+# A single in-memory store backs the one graph that all triples are added to.
+store = plugin.get('IOMemory', Store)()
+genomeGraph = Graph(store,URIRef(URI))
+genomeGraph.bind("ssb",coreURI)
+
+def delete_galaxy():
+	"""Blank out every sys.path entry that contains "galaxy-dist/".
+
+	Matching entries are overwritten in place with the empty string, so the
+	length and ordering of sys.path are unchanged. Note that the import
+	system treats an empty entry as the current working directory.
+	"""
+	galaxy_slots = [slot for slot, entry in enumerate(sys.path) if "galaxy-dist/" in entry]
+	for slot in galaxy_slots:
+		sys.path[slot] = ''
+
+def createClass(uri):
+	"""Declare uri as an OWL class in genomeGraph and hand it back.
+
+	Two triples are added: (uri, rdf:type, owl:Class) and
+	(uri, rdfs:subClassOf, owl:Thing). The same uri is returned so the call
+	can be used inline when a class term is needed.
+	"""
+	class_triples = (
+		(uri, RDF.type, OWL.Class),
+		(uri, RDFS.subClassOf, OWL.Thing),
+	)
+	for triple in class_triples:
+		genomeGraph.add(triple)
+	return uri
+
+def _read_fasta(input_file):
+	"""Read a FASTA file and return its records as (header, sequence) pairs.
+
+	Records are returned in the order their headers first appear in the file.
+	A header is the text of a line starting with ">", with ">" characters and
+	surrounding whitespace stripped. If a header occurs more than once, the
+	last occurrence's sequence is kept. Blank lines are ignored.
+
+	Raises ValueError if sequence data appears before the first header.
+	"""
+	order = []       # headers, in order of first appearance
+	fragments = {}   # header -> list of stripped sequence lines
+	current = None   # None until the first header line has been seen
+	# The context manager guarantees the input handle is closed.
+	with open(input_file) as handle:
+		for line in handle:
+			if line.startswith(">"):
+				current = line.strip(">").strip()
+				if current not in fragments:
+					order.append(current)
+				# A repeated header restarts its record (last one wins).
+				fragments[current] = []
+				continue
+			piece = line.strip()
+			if not piece:
+				continue
+			if current is None:
+				raise ValueError("sequence data found before the first FASTA header in " + input_file)
+			fragments[current].append(piece)
+	# Joining once per record avoids repeated string concatenation.
+	return [(header, "".join(fragments[header])) for header in order]

+def fasta_parser(input_file):
+	"""Add the genome described by a FASTA file and sys.argv to genomeGraph.
+
+	input_file is the path of the FASTA file to read. Everything else comes
+	from the command line:
+	  -idtag           required; its value (spaces -> "_") names the genome.
+	  -id_alternative  used instead (spaces and dots -> "_") when the -idtag
+	                   value is the literal string 'None'.
+	  -organism, -ncbi_taxid, -idtag, -ids
+	                   each occurrence adds a literal to the genome under
+	                   "organism", "taxonomy", "id_tag" and "id_tag".
+	  -sourcedb        looked up once per FASTA record; required as soon as
+	                   the file holds at least one record.
+	A missing required flag raises ValueError from list.index.
+
+	Every FASTA record becomes <genomeID>/dnaobject_<N>, with N following the
+	order of the records in the file, and gets "sequence", "header" and
+	"sourcedb" literals. Returns None.
+	"""
+	createClass(coreURI["Genome"])            #Genome class
+	createClass(coreURI["Type"])                #Type class (Chr,Pls,Scaffold)

+	genomeID = sys.argv[sys.argv.index('-idtag')+1].replace(" ","_")
+	if genomeID == 'None':
+		genomeID = sys.argv[sys.argv.index('-id_alternative')+1].replace(" ","_").replace(".","_")
+	genomeURI = coreURI[genomeID]

+	# Command-line flag -> predicate name under coreURI.
+	flag_predicates = {
+		'-organism': "organism",
+		'-ncbi_taxid': "taxonomy",
+		'-idtag': "id_tag",
+		'-ids': "id_tag",
+	}
+	for index, element in enumerate(sys.argv):
+		predicate = flag_predicates.get(element)
+		if predicate is not None:
+			genomeGraph.add((genomeURI, coreURI[predicate] , Literal(sys.argv[index+1])))

+	# An ordered list of records keeps the dnaobject numbering tied to file
+	# order on every Python version, unlike iterating a plain dict.
+	records = _read_fasta(input_file)

+	genomeClass = createClass(coreURI["Genome"])
+	typeClass = createClass(coreURI["DnaObject"])
+	for index, (header, sequence) in enumerate(records):
+		typeURI = coreURI[genomeID + "/dnaobject_" + str(index)]
+		# Looked up inside the loop on purpose: a file without records must
+		# not start requiring -sourcedb.
+		sourcedb = Literal(sys.argv[sys.argv.index("-sourcedb")+1])
+		genomeGraph.add((genomeURI, coreURI["dnaobject"] , typeURI))
+		genomeGraph.add((genomeURI, coreURI["sourcedb"], sourcedb))
+		genomeGraph.add((typeURI, coreURI["sequence"] ,  Literal(sequence)))
+		genomeGraph.add((typeURI, coreURI["header"], Literal(header)))
+		genomeGraph.add((typeURI, coreURI["sourcedb"], sourcedb))
+		genomeGraph.add((genomeURI, RDF.type,genomeClass))
+		genomeGraph.add((typeURI, RDF.type,typeClass))
+
+def save():
+	"""Serialize genomeGraph as Turtle and write it to the -output path.
+
+	The destination is the command-line value that follows "-output"; a
+	missing flag raises ValueError from list.index. Returns None.
+	"""
+	data = genomeGraph.serialize(format='turtle')
+	# Older rdflib releases return bytes here while rdflib 6 and later return
+	# str; encode so the binary write below works with either.
+	if isinstance(data, str):
+		data = data.encode("utf-8")
+	# The context manager flushes and closes the file even if the write fails.
+	with open(sys.argv[sys.argv.index("-output")+1],"wb") as handle:
+		handle.write(data)
+
+def main():
+	"""Convert the FASTA file named after "-input" and write the result.
+
+	Reads the path that follows "-input" on the command line, feeds it to
+	fasta_parser() and then calls save(). A missing "-input" flag raises
+	ValueError from list.index.
+	"""
+	input_flag_position = sys.argv.index("-input")
+	fasta_parser(sys.argv[input_flag_position + 1])
+	save()
+
+if __name__ == "__main__":
+	# Galaxy ships some of the modules rdflib needs, which interferes with
+	# rdflib's imports, so the Galaxy entries on sys.path are blanked first.
+	# NOTE(review): the top-of-file "from rdflib import ..." has already run
+	# by this point, so this only affects imports made later - confirm that
+	# is sufficient.
+	delete_galaxy()
+	main()
+