sapp: conversion/gbk2rdf/gbktordf.py annotate

author	jjkoehorst
date	Sat, 21 Feb 2015 11:26:51 -0500 (2015-02-21)
parents	74b8ba5e2d5b
children

rev	line source
16 74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	1 #!/usr/bin/env python3.4
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	2 # Author: Jasper Jan Koehorst
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	3 # Date created: Feb 21 2015
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	4 # Function: generation of a RDF file from Genbank/EMBL
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	5
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	6 import warnings
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	7 warnings.filterwarnings("ignore")
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	8
def delete_galaxy():
    """Blank out every ``sys.path`` entry that points into a Galaxy checkout.

    Each entry containing ``"galaxy-dist/"`` is overwritten with the empty
    string; the list object itself and the position of all other entries are
    left unchanged.
    """
    # Imported locally: this function is called before the module-level
    # ``import os, sys`` statement is reached.
    import sys
    sys.path[:] = ['' if "galaxy-dist/" in entry else entry for entry in sys.path]
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	14
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	15 #Some modules that are required by RDFLIB are also in galaxy, this messes up the RDF import function. This is not an elegant solution but it works for now.
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	16 delete_galaxy()
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	17
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	18 from Bio import SeqIO
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	19 # Import RDFLib's default Graph implementation.
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	20 import os, sys
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	21 from Bio.Seq import Seq
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	22
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	23 from rdflib import Graph, URIRef, Literal,Namespace,RDF,RDFS,OWL, plugin
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	24 from rdflib.store import Store
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	25 import hashlib
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	26 store = plugin.get('IOMemory', Store)()
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	27
# Base URI of every resource written to the graph, and the rdflib namespace
# built from it.
URI = "http://csb.wur.nl/genome/"
coreURI = Namespace(URI)

# Textual form of the rdfs:seeAlso predicate.
seeAlso = "rdfs:seeAlso"

# Registries of class names seen while parsing; names are stored as keys
# with the value 1.
SubClassOfDict = {}
SubClassOfDictRna = {}
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	39
def createClass(uri, root=True):
    """Declare ``uri`` as an OWL class in the global ``genomeGraph``.

    Args:
        uri: The class URI to declare.
        root: When true, the class is additionally declared a direct
            subclass of ``owl:Thing``.

    Returns:
        The same ``uri`` that was passed in, so the call can be used inline.
    """
    triples = [(uri, RDF.type, OWL.Class)]
    if root:
        triples.append((uri, RDFS.subClassOf, OWL.Thing))
    for triple in triples:
        genomeGraph.add(triple)
    return uri
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	45
def tmp():
    """Create a fresh scratch directory and publish it as the global ``tmpFolder``.

    The path stored in ``tmpFolder`` always ends with ``"/"`` so callers can
    append file names to it directly.

    The directory is created with ``tempfile.mkdtemp`` instead of a name
    derived from the current timestamp: the name is unpredictable, two runs
    started at the same moment cannot collide, and the directory is only
    accessible to the creating user.
    """
    import tempfile
    global tmpFolder
    tmpFolder = tempfile.mkdtemp(prefix="gbktordf_") + "/"
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	51
def cleantmp():
    """List the contents of the global ``tmpFolder`` on stdout, then delete it.

    The work is done in-process rather than through ``ls`` / ``rm -rf`` shell
    command strings, so a path containing spaces or shell metacharacters can
    neither break the command nor be interpreted by a shell.

    A missing or unreadable folder is not an error: nothing is listed and the
    removal is a no-op.
    """
    import shutil
    try:
        entries = sorted(os.listdir(tmpFolder))
    except OSError:
        entries = []
    for entry in entries:
        # Hidden entries are not listed, matching a plain ``ls``.
        if not entry.startswith("."):
            print(entry)
    shutil.rmtree(tmpFolder, ignore_errors=True)
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	55
def crawler():
    """Entry point: check that an input file was given, then run the parser.

    ``gbk_parser`` looks up the ``-input`` value in ``sys.argv`` itself, so it
    is called without arguments here; passing the file name positionally
    fails with ``TypeError`` when the parser is declared without parameters.

    Raises:
        ValueError: if ``-input`` is absent from the command line or is not
            followed by a value.
    """
    # Fail early with a readable message instead of deep inside the parser.
    if "-input" not in sys.argv[:-1]:
        raise ValueError("missing required argument: -input <file>")
    gbk_parser()
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	60
def _clean_position(position):
    """Return ``position`` as a string with the ``<`` / ``>`` markers removed."""
    return str(position).replace(">", "").replace("<", "")


def _feature_uri(genome, typ, feature_type, start, end, strand):
    """Build the URI of a feature on contig ``genome/typ``.

    The coordinate suffix is ``end_start`` when ``strand`` is ``"-1"`` and
    ``start_end`` otherwise.
    """
    if strand == "-1":
        span = str(end) + "_" + str(start)
    else:
        span = str(start) + "_" + str(end)
    return coreURI[genome + "/" + typ + "/" + feature_type + "/gbk/" + span]


def gbk_parser(input_file=None):
    """Parse a GenBank/EMBL file and store its content in the global ``genomeGraph``.

    A new in-memory graph is created and bound to the module-level name
    ``genomeGraph``.  For every record, the first feature supplies the genome
    and contig description; every later feature becomes a ``feature`` of the
    contig and is described through ``store_general_information``.

    Command-line options read from ``sys.argv``:
        -input <file>       used only when ``input_file`` is not given
        -format <name>      format name handed to ``SeqIO.parse``
        -identifier <name>  optional; replaces the organism-derived genome name

    Args:
        input_file: File to parse.  Optional so that the function can be
            called both with an explicit file and without arguments.

    Raises:
        ValueError: a required option is missing from ``sys.argv``.
        IndexError: a required option is the last command-line token.

    NOTE(review): the nesting of this function was reconstructed from a copy
    whose indentation was lost.  Two placements are assumptions to confirm
    against the repository: the character clean-up of the genome name is
    applied to ``-identifier`` values as well, and
    ``store_general_information`` is called for every stranded misc_feature,
    including those that are not linked to the contig.
    """
    global genomeGraph

    prevObjStart = -1
    prevObjStop = -1
    store = plugin.get('IOMemory', Store)()
    genomeGraph = Graph(store, URIRef(URI))
    genomeGraph.bind("ssb", coreURI)
    if input_file is None:
        input_file = sys.argv[sys.argv.index("-input") + 1]

    # CLASS definitions
    genomeClass = createClass(coreURI["Genome"], root=True)
    typeClass = createClass(coreURI["DnaObject"], root=True)
    createClass(coreURI["Protein"], root=True)
    pubmedClass = createClass(coreURI["Pubmed"], root=True)
    miscClass = createClass(coreURI["MiscFeature"], root=False)
    createClass(coreURI["Feature"], root=True)
    SubClassOfDict["MiscFeature"] = 1
    for rna_class in ("Trna", "Rrna", "Tmrna", "Ncrna"):
        SubClassOfDictRna[rna_class] = 1

    # Characters replaced by "_" in the genome name before it is put in a URI.
    weird_chars = list(''',./?<>:;"'\|\}]{[+=_-)(*&^%$#@!±§~` ''')
    # Counter used to name records that carry neither a GI nor an accession.
    scaf_value = 0
    formatGBK = sys.argv[sys.argv.index("-format") + 1]

    for record in SeqIO.parse(input_file, formatGBK):
        for index, feature in enumerate(record.features):
            if index == 0:
                # ---- First feature: genome / contig level information ----
                if "-identifier" in sys.argv:
                    genome = sys.argv[sys.argv.index("-identifier") + 1]
                else:
                    try:
                        genome = feature.qualifiers["organism"][0].replace(" ", "_")
                    except (KeyError, IndexError):
                        # BUG: THIS IS A TEMP FIX, USE GALAXY -IDENTIFIER TO CAPTURE THIS
                        genome = "XNoneX"
                for char in weird_chars:
                    genome = genome.replace(char, "_")

                # Contig name: GI number, else first accession, else a
                # running scaffold number.
                try:
                    typ = str(record.annotations["gi"])
                except KeyError:
                    try:
                        typ = str(record.annotations["accessions"][0])
                    except (KeyError, IndexError):
                        scaf_value += 1
                        typ = "scaffold_" + str(scaf_value)

                genomeURI = coreURI[genome]
                gbkURI = coreURI[genome + "/" + typ]
                # Genome-to-contig link; all other data hangs off the contig.
                genomeGraph.add((genomeURI, coreURI["dnaobject"], gbkURI))

                # General genome features also stored on the genome itself.
                # The guard tests the key that is actually read ("organism").
                if "organism" in feature.qualifiers:
                    genomeGraph.add((genomeURI, coreURI["organism"], Literal(feature.qualifiers["organism"][0])))
                if "strain" in feature.qualifiers:
                    genomeGraph.add((genomeURI, coreURI["strain"], Literal(feature.qualifiers["strain"][0])))
                if "taxonomy" in record.annotations:
                    for taxon in record.annotations["taxonomy"]:
                        genomeGraph.add((genomeURI, coreURI["taxonomy"], Literal(taxon)))
                    # Emptied so the generic annotation loop below does not
                    # attach the taxonomy to the contig as well.
                    record.annotations["taxonomy"] = []

                # Genome sequence; a sequence made only of X/N is treated as absent.
                sequence = str(record.seq)
                if not sequence.replace("X", "").replace("N", ""):
                    sequence = ""

                # Record annotations
                for annot, value in record.annotations.items():
                    predicate = coreURI[annot.lower()]
                    if not isinstance(value, list):
                        int_add(gbkURI, predicate, str(value))
                    elif annot == "references":
                        for reference in value:
                            if reference.pubmed_id != "":
                                pubmedURI = coreURI["pubmed/" + reference.pubmed_id]
                                genomeGraph.add((gbkURI, predicate, pubmedURI))
                                # Every attribute of the reference object
                                # becomes a literal on the pubmed node.
                                for key, attribute in vars(reference).items():
                                    genomeGraph.add((pubmedURI, coreURI[key.lower()], Literal(str(attribute))))
                                genomeGraph.add((pubmedURI, RDF.type, pubmedClass))
                    else:
                        for item in value:
                            int_add(gbkURI, predicate, str(item))

                if sequence:
                    genomeGraph.add((gbkURI, coreURI["sequence"], Literal(sequence)))
                genomeGraph.add((genomeURI, RDF.type, genomeClass))
                genomeGraph.add((gbkURI, RDF.type, typeClass))
                for key, values in feature.qualifiers.items():
                    genomeGraph.add((gbkURI, coreURI[key.lower()], Literal(values[0])))
                continue

            # ---- The rest of the GBK file ----
            strand = str(feature.location.strand)
            if strand == 'None':
                # Features without a strand are not stored.
                continue

            feature_type = feature.type
            end = _clean_position(feature.location.end)
            start = _clean_position(feature.location.start)

            if feature_type == "misc_feature":
                miscURI = _feature_uri(genome, typ, feature_type, start, end, strand)
                # TODO: Check if biopython has an overlap function...
                # A misc_feature lying completely inside the previous
                # non-misc feature is not linked to the contig.
                inside_previous = (int(prevObjStart) <= int(start)
                                   and int(end) <= int(prevObjStop))
                if not inside_previous:
                    genomeGraph.add((gbkURI, coreURI["feature"], miscURI))
                    genomeGraph.add((miscURI, RDF.type, miscClass))
                store_general_information(miscURI, feature, record)
                continue

            prevObjStart = start
            prevObjStop = end

            typeURI = _feature_uri(genome, typ, feature_type, start, end, strand)
            # Contig specific connection
            genomeGraph.add((gbkURI, coreURI["feature"], typeURI))
            store_general_information(typeURI, feature, record)

            # Features parsed by newer Biopython releases may not carry a
            # ``sub_features`` attribute; a missing attribute means no
            # sub-features.
            for subfeature in getattr(feature, "sub_features", ()):
                subURI = _feature_uri(
                    genome,
                    typ,
                    subfeature.type,
                    _clean_position(subfeature.location.start),
                    _clean_position(subfeature.location.end),
                    str(subfeature.location.strand),
                )
                genomeGraph.add((typeURI, coreURI["feature"], subURI))
                store_general_information(subURI, subfeature, record, feature)
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	221
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	222
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	223 def store_general_information(generalURI,feature,record,superfeature=""):
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	224 proteinClass = createClass(coreURI["Protein"], root=True)
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	225 sequence = str(record.seq)
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	226 cds_sequence = str(feature.extract(sequence))
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	227 #Fixes the 0 count instead of 1-count in biopython vs humans
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	228 feature_type = feature.type
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	229 end = str(feature.location.end).replace(">","").replace("<","")
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	230 start = str(feature.location.start).replace(">","").replace("<","")
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	231 strand = str(feature.location.strand)
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	232 if strand == "None":
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	233 strand = 0
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	234
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	235 genomeGraph.add((generalURI,coreURI["sourcedb"],Literal(sys.argv[sys.argv.index("-sourcedb")+1])))
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	236
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	237 if strand == "-1":
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	238 genomeGraph.add((generalURI,coreURI["end"],Literal(int(start)+1)))
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	239 genomeGraph.add((generalURI,coreURI["begin"],Literal(int(end))))
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	240 else:
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	241 genomeGraph.add((generalURI,coreURI["begin"],Literal(int(start)+1)))
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	242 genomeGraph.add((generalURI,coreURI["end"],Literal(int(end))))
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	243 genomeGraph.add((generalURI,coreURI["strand"],Literal(int(strand))))
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	244 if feature.type != "misc_feature":
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	245 try:
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	246 genomeGraph.add((generalURI,coreURI["sequence"],Literal(cds_sequence)))
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	247 except: #When protein sequence is not given for whatever reason
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	248 print ("wrong?")
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	249
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	250 if feature.type == "misc_feature":
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	251 pass
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	252 else:
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	253 genomeGraph.add((generalURI,RDF.type,createClass(coreURI[feature_type.lower().title()], root=False)))
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	254 if feature_type.lower() != "rrna" and feature_type.lower() != "trna" and feature_type.lower() != "tmrna" and feature_type.lower() != "ncrna":
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	255 SubClassOfDict[feature_type.lower().title()] = 1
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	256 for key in feature.qualifiers:
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	257 values = feature.qualifiers[key]
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	258 if key == "translation":
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	259 pass
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	260 elif type(values) == list:
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	261 for v in values:
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	262 int_add(generalURI,coreURI[key.lower()],v)
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	263 else:
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	264 int_add(generalURI,coreURI[key.lower()],values)
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	265 if feature.type == "CDS":
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	266 try:
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	267 #Feature is normally submitted to this function
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	268 #IF a subfeature is submitted it is submitted as a feature
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	269 #And subfeature variable will contain the superfeature
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	270 if superfeature:
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	271 codon = superfeature.qualifiers["transl_table"][0]
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	272 except:
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	273 #Default codon table 11
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	274 codon = "11"
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	275 #Protein linkage
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	276 translation = ""
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	277 try:
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	278 translation = feature.qualifiers["translation"][0].strip("*")
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	279 except KeyError:
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	280 #When protein sequence is not given...
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	281 if len(feature.location.parts) > 1:
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	282 #Exon boundaries?
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	283 seq = ''
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	284 for loc in feature.location:
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	285 seq += record.seq[loc]
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	286 if int(feature.location.strand) == -1:
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	287 seq = Seq(seq).complement()
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	288 else:
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	289 seq = Seq(seq)
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	290 translation = str(seq.translate(feature.qualifiers["transl_table"][0]))
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	291 elif int(feature.location.strand) == -1:
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	292 if str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].reverse_complement().translate(codon)).strip("*") != translation:
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	293 if len(str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end])) % 3 == 0:
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	294 translation = str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].reverse_complement().translate(codon))
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	295 else:
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	296 translation = ''
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	297 elif int(feature.location.strand) == +1:
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	298 if len(str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end])) % 3 == 0:
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	299 translation = str(record.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end].translate(codon))
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	300 else:
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	301 translation = ''
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	302
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	303 if translation:
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	304 translation = list(translation)
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	305 translation[0] = "M"
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	306 translation = ''.join(translation).strip("*")
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	307 if "*" in translation:
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	308 pass
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	309
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	310 translation = translation.encode('utf-8')
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	311 md5_protein = hashlib.md5(translation).hexdigest()
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	312 proteinURI = coreURI["protein/"+md5_protein]
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	313 genomeGraph.add((generalURI,coreURI["protein"],proteinURI))
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	314 for key in feature.qualifiers:
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	315 for v in feature.qualifiers[key]:
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	316 if key == "translation":
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	317 genomeGraph.add((proteinURI,coreURI["md5"],Literal(md5_protein)))
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	318 genomeGraph.add((proteinURI,coreURI["sequence"],Literal(translation)))
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	319 genomeGraph.add((proteinURI,RDF.type,proteinClass))
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	320 else:
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	321 for v in feature.qualifiers[key]:
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	322 int_add(generalURI,coreURI[key.lower()],v)
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	323
def int_add(subject, predicate, obj):
    """Add (subject, predicate, obj) to genomeGraph with a typed literal.

    Double quotes are stripped from ``obj`` first. The cleaned text is then
    stored as:

    * an integer literal when ``int()`` accepts it,
    * a float literal when only ``float()`` accepts it and the value is
      finite,
    * a plain string literal otherwise.

    subject   -- first element of the triple handed to genomeGraph.add
    predicate -- second element of the triple
    obj       -- text value; must provide ``str.replace``
    """
    # Function-scope import, matching the local-import style already used
    # elsewhere in this file.
    import math

    cleaned = obj.replace('"', '')
    try:
        # int() has to be attempted on its own: it rejects decimal text such
        # as "1.5", which previously pushed every float value into the string
        # fallback. Taking the int directly also keeps very large integers
        # exact instead of routing them through a lossy float.
        value = int(cleaned)
    except ValueError:
        try:
            value = float(cleaned)
        except ValueError:
            value = cleaned
        else:
            # float() also parses "nan", "inf" and "infinity"; such text is
            # kept as a string rather than turned into a non-finite number.
            if not math.isfinite(value):
                value = cleaned
    genomeGraph.add((subject, predicate, Literal(value)))
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	334
def save():
    """Serialize genomeGraph as Turtle and write it to the ``-output`` path.

    The destination is the command-line argument that follows ``-output`` in
    ``sys.argv``. ``ValueError`` is raised when the flag is absent and
    ``IndexError`` when it is the last argument.
    """
    data = genomeGraph.serialize(format='turtle')
    # The file is opened in binary mode. Older rdflib releases return bytes
    # from serialize(); rdflib 6+ returns str when no encoding is requested,
    # so encode that case explicitly instead of failing on write.
    if isinstance(data, str):
        data = data.encode("utf-8")
    output_path = sys.argv[sys.argv.index("-output") + 1]
    # The context manager guarantees the handle is flushed and closed even if
    # the write fails; the original left the file object unclosed.
    with open(output_path, "wb") as handle:
        handle.write(data)
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	338
def subClassOfBuilder():
    """Declare every class name recorded in SubClassOfDict as a Feature subclass.

    For each key in SubClassOfDict two triples are added to genomeGraph:
    ``Feature rdfs:subClassOf owl:Thing`` and
    ``<key> rdfs:subClassOf Feature``. Nothing is added when the dictionary
    is empty.
    """
    for class_name in SubClassOfDict:
        feature_node = coreURI["Feature"]
        # The Feature/owl:Thing statement is re-added on every iteration, so
        # it is only emitted when at least one class name was recorded.
        statements = (
            (feature_node, RDFS.subClassOf, OWL.Thing),
            (coreURI[class_name], RDFS.subClassOf, feature_node),
        )
        for statement in statements:
            genomeGraph.add(statement)
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	343
def subClassOfBuilderRna():
    """Declare every class name recorded in SubClassOfDictRna as an Rna subclass.

    For each key in SubClassOfDictRna four triples are added to genomeGraph:
    ``Feature rdfs:subClassOf owl:Thing``, ``Rna rdfs:subClassOf Feature``,
    ``<key> rdfs:subClassOf Rna`` and ``<key> rdf:type owl:Class``. Nothing
    is added when the dictionary is empty.
    """
    for class_name in SubClassOfDictRna:
        feature_node = coreURI["Feature"]
        rna_node = coreURI["Rna"]
        class_node = coreURI[class_name]
        # The Feature and Rna statements are repeated per iteration, so they
        # only appear when at least one RNA class name was recorded.
        statements = (
            (feature_node, RDFS.subClassOf, OWL.Thing),
            (rna_node, RDFS.subClassOf, feature_node),
            (class_node, RDFS.subClassOf, rna_node),
            (class_node, RDF.type, OWL.Class),
        )
        for statement in statements:
            genomeGraph.add(statement)
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	350
def main():
    """Run the conversion steps one after another, in a fixed order.

    The steps are called without arguments and their return values are
    ignored. An exception in any step stops the run, so later steps
    (including cleantmp) are not reached.
    """
    # NOTE(review): tmp, gbk_parser and cleantmp are defined elsewhere in this
    # file; presumably they set up temporary storage, parse the input record
    # and remove the temporary storage again -- confirm against their bodies.
    steps = (
        tmp,
        gbk_parser,
        subClassOfBuilder,
        subClassOfBuilderRna,
        save,
        cleantmp,
    )
    for step in steps:
        step()
74b8ba5e2d5b aragorn addition jjkoehorst <jasperkoehorst@gmail.com> parents: diff changeset	358
# Run the conversion only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()

16

74b8ba5e2d5b aragorn addition