comparison conversion/fasta2rdf/fastatordf.py @ 29:dd59731d50b5

Biopython and python3.4 inclusion test.
author Jasper Koehorst <jasperkoehorst@gmail.com>
date Wed, 25 Feb 2015 08:25:32 +0100
parents 74b8ba5e2d5b
children
comparison
equal deleted inserted replaced
28:3501912295fb 29:dd59731d50b5
1 #!/usr/bin/env python3.4
2 # Author: Jasper Jan Koehorst
3 # Date created: Jan 22 2015
4 # Function: generation of a RDF file from a genome fasta file
5
6
7 # from io import StringIO
8 from rdflib import Graph, URIRef, Literal,Namespace, RDF,RDFS,OWL, plugin
9 # import rdflib
10 from rdflib.store import Store
11 import sys
12
# Shared RDF state used by every function in this script. Assignments at
# module level are already global, so no ``global`` declarations are needed.
URI = "http://csb.wur.nl/genome/"  # base URI under which all resources are minted
seeAlso = "rdfs:seeAlso"
coreURI = Namespace(URI)

# One in-memory store backs the genome graph; the graph is identified by the
# base URI and its namespace is bound to the "ssb" prefix.
store = plugin.get('IOMemory', Store)()
genomeGraph = Graph(store, URIRef(URI))
genomeGraph.bind("ssb", coreURI)
25
def delete_galaxy():
    """Blank out every ``sys.path`` entry that contains "galaxy-dist/".

    Matching entries are overwritten with the empty string rather than
    removed, so the list keeps its length and the position of every other
    entry. ``sys.path`` is modified in place.
    """
    sys.path[:] = [
        '' if "galaxy-dist/" in entry else entry
        for entry in sys.path
    ]
30
def createClass(uri):
    """Declare *uri* as an OWL class in the genome graph.

    Adds two triples to ``genomeGraph``: ``uri rdf:type owl:Class`` and
    ``uri rdfs:subClassOf owl:Thing``. Returns *uri* unchanged so the call
    can be used directly on the right-hand side of an assignment.
    """
    declarations = (
        (RDF.type, OWL.Class),
        (RDFS.subClassOf, OWL.Thing),
    )
    for predicate, target in declarations:
        genomeGraph.add((uri, predicate, target))
    return uri
35
def _read_fasta(input_file):
    """Return an ordered mapping of FASTA header (without ">") to sequence.

    Records keep the order in which their header first appears in the file.
    A header that occurs more than once restarts its sequence, so the last
    occurrence wins. Blank lines are ignored.

    Raises:
        ValueError: if sequence data appears before the first header line.
    """
    # Local import keeps the block self-contained; OrderedDict guarantees
    # file order on interpreters where plain dicts are unordered (the
    # script's shebang targets python3.4).
    from collections import OrderedDict

    fragments = OrderedDict()  # header -> list of sequence pieces
    header = None
    with open(input_file) as handle:
        for line in handle:
            if line.startswith(">"):
                header = line.strip(">").strip()
                fragments[header] = []
                continue
            piece = line.strip()
            if not piece:
                continue
            if header is None:
                raise ValueError(
                    "FASTA file %r contains sequence data before the first "
                    "'>' header line" % (input_file,))
            fragments[header].append(piece)
    # Join once per record instead of growing a string line by line.
    return OrderedDict(
        (name, "".join(pieces)) for name, pieces in fragments.items())


def fasta_parser(input_file):
    """Populate ``genomeGraph`` with a genome and its sequences from a FASTA file.

    The genome identifier and its metadata are read from ``sys.argv``:

    * ``-idtag VALUE``: genome identifier (spaces become "_"). When VALUE is
      the literal text "None", the value after ``-id_alternative`` is used
      instead (spaces and dots become "_").
    * ``-organism``, ``-ncbi_taxid``, ``-idtag``, ``-ids``: each occurrence
      adds the following argument as a literal on the genome resource.
    * ``-sourcedb VALUE``: attached to the genome and to every sequence
      record; only looked up when the FASTA file holds at least one record.

    Each FASTA record becomes ``<genomeID>/dnaobject_<n>`` (n counts records
    in file order from 0) with ``sequence``, ``header`` and ``sourcedb``
    literals, typed as ``DnaObject`` and linked from the genome through
    ``dnaobject``.

    Args:
        input_file: path of the FASTA file to read.

    Raises:
        ValueError: if a required flag is missing from ``sys.argv`` or the
            file has sequence data before its first header.
        IndexError: if a flag is the last command-line argument.
    """
    genomeClass = createClass(coreURI["Genome"])  # Genome class
    createClass(coreURI["Type"])                  # Type class (Chr,Pls,Scaffold)

    genomeID = sys.argv[sys.argv.index('-idtag')+1].replace(" ", "_")
    if genomeID == 'None':
        genomeID = sys.argv[sys.argv.index('-id_alternative')+1].replace(" ", "_").replace(".", "_")

    genomeURI = coreURI[genomeID]

    # Command-line flag -> predicate (local name under coreURI) on the genome.
    # NOTE(review): "-idtag" and "-ids" both feed "id_tag", exactly as in the
    # original code - confirm this is intended.
    flag_predicates = {
        '-organism': "organism",
        '-ncbi_taxid': "taxonomy",
        '-idtag': "id_tag",
        '-ids': "id_tag",
    }
    for index, element in enumerate(sys.argv):
        if element in flag_predicates:
            genomeGraph.add((genomeURI, coreURI[flag_predicates[element]], Literal(sys.argv[index+1])))

    fastadict = _read_fasta(input_file)

    typeClass = createClass(coreURI["DnaObject"])
    if not fastadict:
        # No records: as before, nothing further is added to the genome and
        # "-sourcedb" is not required.
        return

    # Genome-level triples are the same for every record, so add them once.
    sourcedb = Literal(sys.argv[sys.argv.index("-sourcedb")+1])
    genomeGraph.add((genomeURI, coreURI["sourcedb"], sourcedb))
    genomeGraph.add((genomeURI, RDF.type, genomeClass))

    for index, (header, sequence) in enumerate(fastadict.items()):
        typeURI = coreURI[genomeID + "/dnaobject_" + str(index)]
        genomeGraph.add((genomeURI, coreURI["dnaobject"], typeURI))
        genomeGraph.add((typeURI, coreURI["sequence"], Literal(sequence)))
        genomeGraph.add((typeURI, coreURI["header"], Literal(header)))
        genomeGraph.add((typeURI, coreURI["sourcedb"], sourcedb))
        genomeGraph.add((typeURI, RDF.type, typeClass))
83
def save():
    """Write ``genomeGraph`` as Turtle to the path that follows ``-output``.

    Raises:
        ValueError: if "-output" is not present in ``sys.argv``.
        IndexError: if "-output" is the last command-line argument.
    """
    output_path = sys.argv[sys.argv.index("-output")+1]
    data = genomeGraph.serialize(format='turtle')
    # Depending on the rdflib version, serialize() without a destination
    # yields bytes or str; normalise to bytes so the binary write always works.
    if isinstance(data, str):
        data = data.encode("utf-8")
    # The context manager closes (and flushes) the file even if the write fails.
    with open(output_path, "wb") as handle:
        handle.write(data)
87
def main():
    """Run the conversion: parse the FASTA file named after "-input", then save.

    The input path is the command-line argument immediately following
    "-input"; ``list.index`` raises ValueError when that flag is absent.
    """
    flag_position = sys.argv.index("-input")
    fasta_path = sys.argv[flag_position + 1]
    fasta_parser(fasta_path)
    save()
92
if __name__ == "__main__":
    # Galaxy bundles some of the modules rdflib depends on, which interferes
    # with the RDF import machinery; blank those sys.path entries first.
    delete_galaxy()
    main()
97