comparison fasta2rdf/fastatordf.py @ 9:3f4f1cd22a6a

FASTA 2 RDF code cleanup
author jjkoehorst <jasperkoehorst@gmail.com>
date Sat, 21 Feb 2015 15:38:26 +0100
parents ec73c34af97b
children
comparison
equal deleted inserted replaced
7:c79025539d9b 9:3f4f1cd22a6a
1 #!/usr/bin/env python3.4 1 #!/usr/bin/env python3.4
2 # Author: Jasper Jan Koehorst 2 # Author: Jasper Jan Koehorst
3 # Date created: Jan 22 2015 3 # Date created: Jan 22 2015
4 # Function: generation of a RDF file from a genome fasta file 4 # Function: generation of a RDF file from a genome fasta file
5 5
6 def delete_galaxy():
7 import sys
8 for index, path in enumerate(sys.path):
9 if "galaxy-dist/" in path:
10 sys.path[index] = ''
11
12 #Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function.
13 delete_galaxy()
14 6
15 # from io import StringIO 7 # from io import StringIO
16 from rdflib import Graph, URIRef, Literal,Namespace, RDF,RDFS,OWL, plugin 8 from rdflib import Graph, URIRef, Literal,Namespace, RDF,RDFS,OWL, plugin
17 # import rdflib 9 # import rdflib
18 from rdflib.store import Store 10 from rdflib.store import Store
24 URI = "http://csb.wur.nl/genome/" 16 URI = "http://csb.wur.nl/genome/"
25 global seeAlso 17 global seeAlso
26 seeAlso = "rdfs:seeAlso" 18 seeAlso = "rdfs:seeAlso"
27 global coreURI 19 global coreURI
28 coreURI = Namespace(URI) 20 coreURI = Namespace(URI)
21 global genomeGraph
22 store = plugin.get('IOMemory', Store)()
23 genomeGraph = Graph(store,URIRef(URI))
24 genomeGraph.bind("ssb",coreURI)
25
26 def delete_galaxy():
27 for index, path in enumerate(sys.path):
28 if "galaxy-dist/" in path:
29 sys.path[index] = ''
29 30
30 def createClass(uri): 31 def createClass(uri):
31 genomeGraph.add((uri,RDF.type,OWL.Class)) 32 genomeGraph.add((uri,RDF.type,OWL.Class))
32 genomeGraph.add((uri,RDFS.subClassOf,OWL.Thing)) 33 genomeGraph.add((uri,RDFS.subClassOf,OWL.Thing))
33 return uri 34 return uri
36 createClass(coreURI["Genome"]) #Genome class 37 createClass(coreURI["Genome"]) #Genome class
37 createClass(coreURI["Type"]) #Type class (Chr,Pls,Scaffold) 38 createClass(coreURI["Type"]) #Type class (Chr,Pls,Scaffold)
38 39
39 genomeDict = {} 40 genomeDict = {}
40 41
41 #requires chromosome_1, chromosome_2, chromosome_1... #For multiple scaffolds
42 # regex = re.compile('\[type=(.*?)\]')
43 sequence = "" 42 sequence = ""
44 genomeID = sys.argv[sys.argv.index('-idtag')+1].replace(" ","_") 43 genomeID = sys.argv[sys.argv.index('-idtag')+1].replace(" ","_")
45 if genomeID == 'None': 44 if genomeID == 'None':
46 genomeID = sys.argv[sys.argv.index('-id_alternative')+1].replace(" ","_").replace(".","_") 45 genomeID = sys.argv[sys.argv.index('-id_alternative')+1].replace(" ","_").replace(".","_")
47 46
55 genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1]))) 54 genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1])))
56 if '-ids' == element: 55 if '-ids' == element:
57 genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1]))) 56 genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1])))
58 57
59 genomeDict[genomeID] = {} 58 genomeDict[genomeID] = {}
60 # typDict = {"plasmid":0,"scaffold":0,"chromosome":0}
61 59
62 #Generating genome dictionary 60 #Generating genome dictionary
63 data = open(input_file).readlines() 61 data = open(input_file).readlines()
64 fastadict = {} 62 fastadict = {}
65 key = "" 63 key = ""
68 key = line.strip(">").strip() 66 key = line.strip(">").strip()
69 fastadict[key] = "" 67 fastadict[key] = ""
70 else: 68 else:
71 fastadict[key] += line.strip() 69 fastadict[key] += line.strip()
72 70
73 # for line in fastadict:
74 # typ = regex.findall(line)
75 # value = 0
76 #If something is found
77 # if len(typ) > 0:
78 # typ = typ[0]
79 #If something is not found
80 # elif typ == []:
81 # typ = "scaffold"
82 #If something is found but does not contain a value
83 # elif "_" in typ:
84 # value = typ.split("_")[-1]
85 # try:
86 # value = int(value)
87 # except:
88 # value = 1
89 #Not a integer
90
91 #If a value is not given it is automatically assigned as the first one
92 #If a value is given...
93 # if value > -1:
94 #If a second scaffold of a chromosome_1 is found
95 # if typ in genomeDict[genome]:
96 #Retrieve how many
97 # value = len(genomeDict[genome][typ]) + 1
98 # genomeDict[genome][typ]["scaffold_"+str(value)] = {"contig":fastadict[line]}
99 # else:
100 # genomeDict[genome][typ] = {}
101 # genomeDict[genome][typ]["scaffold_1"] = {"contig":fastadict[line]}
102
103 #Genome dictionary to TTL
104 genomeClass = createClass(coreURI["Genome"]) 71 genomeClass = createClass(coreURI["Genome"])
105 typeClass = createClass(coreURI["DnaObject"]) 72 typeClass = createClass(coreURI["DnaObject"])
106 for index, genome in enumerate(fastadict): 73 for index, genome in enumerate(fastadict):
107 # for typ in genomeDict[genome]: 74 typeURI = coreURI[genomeID + "/dnaobject_" + str(index)]
108 # for scaf in genomeDict[genome][typ]: 75 sequence = fastadict[genome]
109 # for con in genomeDict[genome][typ][scaf]: 76 genomeGraph.add((genomeURI, coreURI["dnaobject"] , typeURI))
110 #A note is required here... 77 genomeGraph.add((genomeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
111 #Due to RDF performances we are reducing the amount of triples needed from a genome to a contig. 78 genomeGraph.add((typeURI, coreURI["sequence"] , Literal(sequence)))
112 #Previously it was 79 genomeGraph.add((typeURI, coreURI["header"], Literal(genome)))
113 # Genome > Class > Scaffold > Contig 80 genomeGraph.add((typeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
114 #Now it will be 81 genomeGraph.add((genomeURI, RDF.type,genomeClass))
115 # Genome > Class/Scaffold/Contig 82 genomeGraph.add((typeURI, RDF.type,typeClass))
116 #typeURI = coreURI[genome + "/" + typ]
117 #scaffoldURI = coreURI[genome + "/" + typ + "/" + scaf]
118 #Was contigURI
119 typeURI = coreURI[genomeID + "/dnaobject_" + str(index)] # + "/" + scaf + "/" + con]
120 # sequence = genomeDict[genome][typ][scaf][con]
121 sequence = fastadict[genome]
122 genomeGraph.add((genomeURI, coreURI["dnaobject"] , typeURI))
123 genomeGraph.add((genomeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
124 genomeGraph.add((typeURI, coreURI["sequence"] , Literal(sequence)))
125 genomeGraph.add((typeURI, coreURI["header"], Literal(genome)))
126 genomeGraph.add((typeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
127 genomeGraph.add((genomeURI, RDF.type,genomeClass))
128 genomeGraph.add((typeURI, RDF.type,typeClass))
129 83
130 def save(): 84 def save():
131 data = genomeGraph.serialize(format='turtle') 85 data = genomeGraph.serialize(format='turtle')
132 open(sys.argv[sys.argv.index("-output")+1],"wb").write(data) 86 open(sys.argv[sys.argv.index("-output")+1],"wb").write(data)
133 87
134 def main(): 88 def main():
135 store = plugin.get('IOMemory', Store)()
136 global genomeGraph
137 genomeGraph = Graph(store,URIRef(URI))
138 genomeGraph.bind("ssb",coreURI)
139 input_file = sys.argv[sys.argv.index("-input")+1] 89 input_file = sys.argv[sys.argv.index("-input")+1]
140 fasta_parser(input_file) 90 fasta_parser(input_file)
141 save() 91 save()
142 92
143 if __name__ == '__main__': 93 if __name__ == '__main__':
94 #Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function.
95 delete_galaxy()
144 main() 96 main()
145 97