Mercurial > repos > jjkoehorst > sapp
diff conversion/fasta2rdf/fastatordf.py @ 16:74b8ba5e2d5b
aragorn addition
author | jjkoehorst <jasperkoehorst@gmail.com> |
---|---|
date | Sat, 21 Feb 2015 17:17:06 +0100 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/env python3.4
# Author: Jasper Jan Koehorst
# Date created: Jan 22 2015
# Function: generation of a RDF file from a genome fasta file
#
# Provenance: added as b/conversion/fasta2rdf/fastatordf.py
# (diff against /dev/null, Sat Feb 21 17:17:06 2015 +0100, hunk @@ -0,0 +1,97 @@).
"""Convert a genome FASTA file into an RDF graph serialized as Turtle.

All options are read directly from ``sys.argv`` as ``-flag value`` pairs:

Required:
    -input           path of the FASTA file to read
    -output          path of the Turtle file to write
    -idtag           genome identifier (spaces become underscores); the literal
                     string ``None`` means "use -id_alternative instead"
    -id_alternative  fallback identifier, only read when -idtag is ``None``
    -sourcedb        source database label attached to the genome and to
                     every DNA object

Optional (each occurrence adds one triple to the genome):
    -organism        stored under the ``organism`` predicate
    -ncbi_taxid      stored under the ``taxonomy`` predicate
    -ids             stored under the ``id_tag`` predicate (as is -idtag)
"""

import sys
from collections import OrderedDict

from rdflib import Graph, URIRef, Literal, Namespace, RDF, RDFS, OWL, plugin
from rdflib.store import Store

# Base URI of every resource and predicate emitted by this script.
URI = "http://csb.wur.nl/genome/"
seeAlso = "rdfs:seeAlso"
coreURI = Namespace(URI)

# Single in-memory store backing the module-wide graph.
store = plugin.get('IOMemory', Store)()
genomeGraph = Graph(store, URIRef(URI))
genomeGraph.bind("ssb", coreURI)

# Command-line flags that are copied verbatim onto the genome resource,
# mapped to the predicate (local name inside coreURI) they are stored under.
_METADATA_FLAGS = {
    '-organism': "organism",
    '-ncbi_taxid': "taxonomy",
    '-idtag': "id_tag",
    '-ids': "id_tag",
}


def _argument_after(flag):
    """Return the command-line token that directly follows *flag*.

    Only the first occurrence of *flag* in ``sys.argv`` is considered.

    Raises:
        ValueError: if *flag* is absent, or is the last token and therefore
            has no value.
    """
    try:
        position = sys.argv.index(flag)
    except ValueError:
        raise ValueError(
            "required command-line option %r is missing" % flag) from None
    if position + 1 >= len(sys.argv):
        raise ValueError("command-line option %r has no value" % flag)
    return sys.argv[position + 1]


def delete_galaxy():
    """Drop every ``sys.path`` entry that contains ``galaxy-dist/``.

    The entries are removed rather than blanked: an empty string in
    ``sys.path`` stands for the current working directory, so blanking them
    would silently make the working directory importable.
    """
    sys.path[:] = [path for path in sys.path if "galaxy-dist/" not in path]


def createClass(uri):
    """Declare *uri* as an ``owl:Class`` that is a subclass of ``owl:Thing``.

    The two triples are added to the module-wide ``genomeGraph``.

    Returns:
        The same *uri*, so the call can be used inline.
    """
    genomeGraph.add((uri, RDF.type, OWL.Class))
    genomeGraph.add((uri, RDFS.subClassOf, OWL.Thing))
    return uri


def _read_fasta(input_file):
    """Parse *input_file* into an ordered ``header -> sequence`` mapping.

    A header is the text after the leading ``>`` with surrounding whitespace
    removed; the sequence is the concatenation of the stripped lines that
    follow it. Blank lines are ignored. If a header occurs more than once,
    the later record replaces the earlier one.

    Records keep the order in which they first appear in the file, so the
    positional index derived from this mapping is reproducible between runs.

    Raises:
        ValueError: if sequence data appears before the first header.
    """
    records = OrderedDict()
    chunks = None  # pieces of the record currently being read
    with open(input_file) as handle:
        for line in handle:
            if line.startswith(">"):
                chunks = []
                records[line.lstrip(">").strip()] = chunks
                continue
            residues = line.strip()
            if not residues:
                continue
            if chunks is None:
                raise ValueError(
                    "%s: sequence data found before the first '>' header"
                    % input_file)
            chunks.append(residues)
    # Join once per record instead of growing a string line by line.
    return OrderedDict(
        (header, "".join(parts)) for header, parts in records.items())


def fasta_parser(input_file):
    """Populate ``genomeGraph`` with the genome described by *input_file*.

    Creates the ``Genome``, ``Type`` and ``DnaObject`` classes, one genome
    resource named after ``-idtag`` (or ``-id_alternative``), and one
    ``<genome>/dnaobject_<n>`` resource per FASTA record carrying its
    sequence, header and source database.

    Raises:
        ValueError: if a required command-line option is missing or has no
            value, or if the FASTA file holds sequence data before a header.
    """
    genomeClass = createClass(coreURI["Genome"])  # Genome class
    createClass(coreURI["Type"])  # Type class (Chr,Pls,Scaffold)
    typeClass = createClass(coreURI["DnaObject"])

    genomeID = _argument_after('-idtag').replace(" ", "_")
    if genomeID == 'None':
        genomeID = (_argument_after('-id_alternative')
                    .replace(" ", "_").replace(".", "_"))
    genomeURI = coreURI[genomeID]

    # Every occurrence of a metadata flag contributes one triple.
    for index, element in enumerate(sys.argv):
        predicate = _METADATA_FLAGS.get(element)
        if predicate is None:
            continue
        if index + 1 >= len(sys.argv):
            raise ValueError(
                "command-line option %r has no value" % element)
        genomeGraph.add(
            (genomeURI, coreURI[predicate], Literal(sys.argv[index + 1])))

    fastadict = _read_fasta(input_file)

    # Genome-level facts do not depend on the individual records, so they
    # are stated once; this also types the genome when the file is empty.
    sourcedb = Literal(_argument_after("-sourcedb"))
    genomeGraph.add((genomeURI, RDF.type, genomeClass))
    genomeGraph.add((genomeURI, coreURI["sourcedb"], sourcedb))

    for index, (header, sequence) in enumerate(fastadict.items()):
        typeURI = coreURI[genomeID + "/dnaobject_" + str(index)]
        genomeGraph.add((genomeURI, coreURI["dnaobject"], typeURI))
        genomeGraph.add((typeURI, coreURI["sequence"], Literal(sequence)))
        genomeGraph.add((typeURI, coreURI["header"], Literal(header)))
        genomeGraph.add((typeURI, coreURI["sourcedb"], sourcedb))
        genomeGraph.add((typeURI, RDF.type, typeClass))


def save():
    """Serialize ``genomeGraph`` as Turtle into the file named by ``-output``.

    Raises:
        ValueError: if ``-output`` is missing or has no value.
    """
    data = genomeGraph.serialize(format='turtle')
    # Depending on the rdflib release, serialize() hands back bytes or str;
    # normalise to bytes so the binary write works with either.
    if isinstance(data, str):
        data = data.encode("utf-8")
    with open(_argument_after("-output"), "wb") as handle:
        handle.write(data)


def main():
    """Read the FASTA file named by ``-input`` and write the Turtle output."""
    input_file = _argument_after("-input")
    fasta_parser(input_file)
    save()


if __name__ == '__main__':
    # Some modules that are required by RDFLIB are also in galaxy, this
    # messes up the RDF import function.
    # NOTE(review): rdflib itself is already imported at module load, before
    # this call runs; presumably only imports triggered later (e.g. lazily
    # loaded rdflib plugins) benefit from the cleanup -- confirm.
    delete_galaxy()
    main()