Mercurial > repos > jjkoehorst > sapp
comparison conversion/fasta2rdf/fastatordf.py @ 16:74b8ba5e2d5b
aragorn addition
author | jjkoehorst <jasperkoehorst@gmail.com> |
---|---|
date | Sat, 21 Feb 2015 17:17:06 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
15:10cad758ed0f | 16:74b8ba5e2d5b |
---|---|
1 #!/usr/bin/env python3.4 | |
2 # Author: Jasper Jan Koehorst | |
3 # Date created: Jan 22 2015 | |
4 # Function: generation of an RDF file from a genome FASTA file | |
5 | |
6 | |
7 # from io import StringIO | |
8 from rdflib import Graph, URIRef, Literal,Namespace, RDF,RDFS,OWL, plugin | |
9 # import rdflib | |
10 from rdflib.store import Store | |
11 import sys | |
12 | |
# Module-level state shared by every function in this script.
# (No `global` statements are needed here: names assigned at module scope
# are already module globals.)

# Base URI under which all resources and predicates of this script are minted.
URI = "http://csb.wur.nl/genome/"
seeAlso = "rdfs:seeAlso"
coreURI = Namespace(URI)

# One in-memory store backs the genome graph; the graph is identified by the
# base URI and serializes coreURI terms with the "ssb" prefix.
store = plugin.get('IOMemory', Store)()
genomeGraph = Graph(store, URIRef(URI))
genomeGraph.bind("ssb", coreURI)
25 | |
def delete_galaxy():
    """Remove every Galaxy distribution directory from ``sys.path``.

    Any entry containing ``"galaxy-dist/"`` is dropped so that modules
    imported after this call are not resolved from those directories.

    The entries are removed rather than overwritten with ``''``: an empty
    string in ``sys.path`` tells the interpreter to search the current
    working directory, which would swap one unintended import location for
    another.
    """
    # Slice assignment mutates the existing list object, so anything already
    # holding a reference to sys.path sees the filtered list too.
    sys.path[:] = [path for path in sys.path if "galaxy-dist/" not in path]
30 | |
def createClass(uri):
    """Declare *uri* as an OWL class in the module-level genome graph.

    Two triples with *uri* as subject are added to ``genomeGraph``:
    ``rdf:type owl:Class`` and ``rdfs:subClassOf owl:Thing``.

    Returns the same *uri* that was passed in, so the call can be used
    inline where the class reference is needed.
    """
    class_statements = (
        (RDF.type, OWL.Class),
        (RDFS.subClassOf, OWL.Thing),
    )
    for predicate, target in class_statements:
        genomeGraph.add((uri, predicate, target))
    return uri
35 | |
def fasta_parser(input_file):
    """Load a genome FASTA file into the module-level ``genomeGraph``.

    Command-line flags are read directly from ``sys.argv``:

    * ``-idtag`` (required): genome identifier; spaces become underscores.
      If its value is the string ``'None'``, ``-id_alternative`` is used
      instead (spaces and dots become underscores).
    * ``-organism``, ``-ncbi_taxid``, ``-idtag``, ``-ids`` (each optional in
      the scan below): stored as literals on the genome resource.
    * ``-sourcedb``: required as soon as the FASTA file holds at least one
      record; stored on the genome and on every sequence resource.

    Each FASTA record becomes a ``<genomeID>/dnaobject_<n>`` resource, with
    ``n`` following the order of first appearance of its header in the file,
    carrying its sequence, its header and the source database.

    Parameters:
        input_file: path of the FASTA file to read.

    Returns:
        None.  All results are added to ``genomeGraph``.

    Raises:
        ValueError: if a required flag is missing from ``sys.argv``, or if
            the file contains sequence text before the first ``>`` header.
        IndexError: if a flag is the last element of ``sys.argv`` and so
            has no value after it.
    """
    argv = sys.argv

    genomeClass = createClass(coreURI["Genome"])   # Genome class
    createClass(coreURI["Type"])                   # Type class (Chr,Pls,Scaffold)
    typeClass = createClass(coreURI["DnaObject"])  # one instance per FASTA record

    genomeID = argv[argv.index('-idtag') + 1].replace(" ", "_")
    if genomeID == 'None':
        genomeID = argv[argv.index('-id_alternative') + 1].replace(" ", "_").replace(".", "_")
    genomeURI = coreURI[genomeID]

    # Flag -> predicate (local name under coreURI) for the genome annotations.
    # NOTE(review): '-ids' is stored under the same 'id_tag' predicate as
    # '-idtag', exactly as before - confirm this is intended.
    annotation_flags = {
        '-organism': "organism",
        '-ncbi_taxid': "taxonomy",
        '-idtag': "id_tag",
        '-ids': "id_tag",
    }
    for index, element in enumerate(argv):
        if element in annotation_flags:
            genomeGraph.add((genomeURI, coreURI[annotation_flags[element]], Literal(argv[index + 1])))

    records = _read_fasta_records(input_file)
    if not records:
        # Nothing to describe: no genome-level typing or sourcedb triples are
        # emitted for a file without records.
        return

    # Looked up once instead of twice per record.
    sourcedb = Literal(argv[argv.index("-sourcedb") + 1])
    genomeGraph.add((genomeURI, coreURI["sourcedb"], sourcedb))
    genomeGraph.add((genomeURI, RDF.type, genomeClass))

    for index, (header, sequence) in enumerate(records.items()):
        typeURI = coreURI[genomeID + "/dnaobject_" + str(index)]
        genomeGraph.add((genomeURI, coreURI["dnaobject"], typeURI))
        genomeGraph.add((typeURI, coreURI["sequence"], Literal(sequence)))
        genomeGraph.add((typeURI, coreURI["header"], Literal(header)))
        genomeGraph.add((typeURI, coreURI["sourcedb"], sourcedb))
        genomeGraph.add((typeURI, RDF.type, typeClass))


def _read_fasta_records(input_file):
    """Return an ordered mapping of FASTA header -> concatenated sequence."""
    # Local import keeps this block self-contained; OrderedDict makes the
    # record order (and so the dnaobject numbering) reproducible on
    # interpreters whose plain dict has no guaranteed order.
    from collections import OrderedDict

    chunks_by_header = OrderedDict()
    current = None
    with open(input_file) as handle:
        for line in handle:
            if line.startswith(">"):
                current = line.strip(">").strip()
                # A repeated header starts over and replaces the sequence
                # collected earlier under the same header.
                chunks_by_header[current] = []
                continue
            residues = line.strip()
            if current is None:
                if residues:
                    raise ValueError(
                        "{0}: sequence data found before the first FASTA header".format(input_file))
                continue  # blank lines ahead of the first header are harmless
            chunks_by_header[current].append(residues)

    # Join once per record; repeated string += is quadratic on large genomes.
    return OrderedDict(
        (header, "".join(chunks)) for header, chunks in chunks_by_header.items())
83 | |
def save():
    """Write the module-level ``genomeGraph`` as Turtle to the ``-output`` path.

    The destination is the command-line value that follows ``-output`` in
    ``sys.argv``; the file is written in binary mode and closed before
    returning.

    Raises:
        ValueError: if ``-output`` is not present in ``sys.argv``.
        IndexError: if ``-output`` is the last element of ``sys.argv``.
    """
    data = genomeGraph.serialize(format='turtle')
    # Older rdflib releases return bytes here, while rdflib 6 and later
    # return str when no encoding is requested; normalise so the binary
    # write below works with either.
    if isinstance(data, str):
        data = data.encode("utf-8")
    output_path = sys.argv[sys.argv.index("-output") + 1]
    with open(output_path, "wb") as handle:
        handle.write(data)
87 | |
def main():
    """Convert the FASTA file named after ``-input`` and save the graph.

    The path is taken from the command-line value following ``-input``;
    it is handed to ``fasta_parser`` and the resulting graph is then
    written out by ``save``.
    """
    arguments = sys.argv
    flag_position = arguments.index("-input")
    fasta_parser(arguments[flag_position + 1])
    save()
91 save() | |
92 | |
if __name__ == "__main__":
    # Galaxy ships some of the modules that rdflib also requires, and those
    # copies interfere with rdflib's imports, so Galaxy's directories are
    # taken off the import path before any work is done.
    # NOTE(review): the module-level rdflib imports have already run by this
    # point, so this only affects imports made later - confirm that is enough.
    delete_galaxy()
    main()
97 |