Mercurial > repos > jjkoehorst > sapp
view fasta2rdf/fastatordf.py @ 6:ec73c34af97b
FASTA2RDF
author | jjkoehorst <jasperkoehorst@gmail.com> |
---|---|
date | Sat, 21 Feb 2015 15:19:42 +0100 |
parents | |
children | 3f4f1cd22a6a |
line wrap: on
line source
#!/usr/bin/env python3.4 # Author: Jasper Jan Koehorst # Date created: Jan 22 2015 # Function: generation of a RDF file from a genome fasta file def delete_galaxy(): import sys for index, path in enumerate(sys.path): if "galaxy-dist/" in path: sys.path[index] = '' #Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function. delete_galaxy() # from io import StringIO from rdflib import Graph, URIRef, Literal,Namespace, RDF,RDFS,OWL, plugin # import rdflib from rdflib.store import Store import sys store = plugin.get('IOMemory', Store)() global URI URI = "http://csb.wur.nl/genome/" global seeAlso seeAlso = "rdfs:seeAlso" global coreURI coreURI = Namespace(URI) def createClass(uri): genomeGraph.add((uri,RDF.type,OWL.Class)) genomeGraph.add((uri,RDFS.subClassOf,OWL.Thing)) return uri def fasta_parser(input_file): createClass(coreURI["Genome"]) #Genome class createClass(coreURI["Type"]) #Type class (Chr,Pls,Scaffold) genomeDict = {} #requires chromosome_1, chromosome_2, chromosome_1... #For multiple scaffolds # regex = re.compile('\[type=(.*?)\]') sequence = "" genomeID = sys.argv[sys.argv.index('-idtag')+1].replace(" ","_") if genomeID == 'None': genomeID = sys.argv[sys.argv.index('-id_alternative')+1].replace(" ","_").replace(".","_") genomeURI = coreURI[genomeID] for index, element in enumerate(sys.argv): if '-organism' == element: genomeGraph.add((genomeURI, coreURI["organism"] , Literal(sys.argv[index+1]))) if '-ncbi_taxid' == element: genomeGraph.add((genomeURI, coreURI["taxonomy"] , Literal(sys.argv[index+1]))) if '-idtag' == element: genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1]))) if '-ids' == element: genomeGraph.add((genomeURI, coreURI["id_tag"] , Literal(sys.argv[index+1]))) genomeDict[genomeID] = {} # typDict = {"plasmid":0,"scaffold":0,"chromosome":0} #Generating genome dictionary data = open(input_file).readlines() fastadict = {} key = "" for index, line in enumerate(data): if ">" == line[0]: key = line.strip(">").strip() fastadict[key] = "" else: fastadict[key] += line.strip() # for line in fastadict: # typ = regex.findall(line) # value = 0 #If something is found # if len(typ) > 0: # typ = typ[0] #If something is not found # elif typ == []: # typ = "scaffold" #If something is found but does not contain a value # elif "_" in typ: # value = typ.split("_")[-1] # try: # value = int(value) # except: # value = 1 #Not a integer #If a value is not given it is automatically assigned as the first one #If a value is given... # if value > -1: #If a second scaffold of a chromosome_1 is found # if typ in genomeDict[genome]: #Retrieve how many # value = len(genomeDict[genome][typ]) + 1 # genomeDict[genome][typ]["scaffold_"+str(value)] = {"contig":fastadict[line]} # else: # genomeDict[genome][typ] = {} # genomeDict[genome][typ]["scaffold_1"] = {"contig":fastadict[line]} #Genome dictionary to TTL genomeClass = createClass(coreURI["Genome"]) typeClass = createClass(coreURI["DnaObject"]) for index, genome in enumerate(fastadict): # for typ in genomeDict[genome]: # for scaf in genomeDict[genome][typ]: # for con in genomeDict[genome][typ][scaf]: #A note is required here... #Due to RDF performances we are reducing the amount of triples needed from a genome to a contig. #Previously it was # Genome > Class > Scaffold > Contig #Now it will be # Genome > Class/Scaffold/Contig #typeURI = coreURI[genome + "/" + typ] #scaffoldURI = coreURI[genome + "/" + typ + "/" + scaf] #Was contigURI typeURI = coreURI[genomeID + "/dnaobject_" + str(index)] # + "/" + scaf + "/" + con] # sequence = genomeDict[genome][typ][scaf][con] sequence = fastadict[genome] genomeGraph.add((genomeURI, coreURI["dnaobject"] , typeURI)) genomeGraph.add((genomeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1]))) genomeGraph.add((typeURI, coreURI["sequence"] , Literal(sequence))) genomeGraph.add((typeURI, coreURI["header"], Literal(genome))) genomeGraph.add((typeURI, coreURI["sourcedb"], Literal(sys.argv[sys.argv.index("-sourcedb")+1]))) genomeGraph.add((genomeURI, RDF.type,genomeClass)) genomeGraph.add((typeURI, RDF.type,typeClass)) def save(): data = genomeGraph.serialize(format='turtle') open(sys.argv[sys.argv.index("-output")+1],"wb").write(data) def main(): store = plugin.get('IOMemory', Store)() global genomeGraph genomeGraph = Graph(store,URIRef(URI)) genomeGraph.bind("ssb",coreURI) input_file = sys.argv[sys.argv.index("-input")+1] fasta_parser(input_file) save() if __name__ == '__main__': main()