view gbk2rdf/gbktordf.py @ 7:c79025539d9b

FASTA to RDF
author jjkoehorst <jasperkoehorst@gmail.com>
date Sat, 21 Feb 2015 15:23:15 +0100
parents ec73c34af97b
children
line wrap: on
line source

#!/usr/bin/env python3.4
# Author: Jasper Jan Koehorst
# Date created: Feb 21 2015
# Function: generation of a RDF file from Genbank/EMBL

import warnings
warnings.filterwarnings("ignore")

def delete_galaxy():
	"""Blank out every sys.path entry that contains "galaxy-dist/".

	Matching entries are replaced in place by the empty string, so the
	list keeps its length and the order of the remaining entries.
	"""
	import sys
	sys.path[:] = ['' if "galaxy-dist/" in entry else entry for entry in sys.path]

# Some modules required by rdflib also ship with Galaxy, and Galaxy's copies break the rdflib import below. Blanking the Galaxy entries on sys.path is not an elegant solution, but it works for now.
delete_galaxy()

from Bio import SeqIO
# Import RDFLib's default Graph implementation.
import os, sys
from Bio.Seq import Seq

from rdflib import Graph, URIRef, Literal,Namespace,RDF,RDFS,OWL, plugin
from rdflib.store import Store
import hashlib
# In-memory rdflib store created at import time. gbk_parser() creates its own
# store for the graph it fills, so nothing else in this file reads this one.
store = plugin.get('IOMemory', Store)()

# Base URI of every resource written to the graph, and the namespace built on it.
URI = "http://csb.wur.nl/genome/"
seeAlso = "rdfs:seeAlso"
coreURI = Namespace(URI)

# Registries of class names (the value is always 1; only the keys are used).
# subClassOfBuilder() / subClassOfBuilderRna() turn them into rdfs:subClassOf
# triples once parsing is finished.
SubClassOfDict = {}
SubClassOfDictRna = {}

def createClass(uri, root=True):
	"""Declare `uri` as an owl:Class in the global graph and return `uri`.

	uri  -- resource to declare.
	root -- when true the class is additionally made a direct
	        rdfs:subClassOf owl:Thing.
	"""
	triples = [(uri,RDF.type,OWL.Class)]
	if root:
		triples.append((uri,RDFS.subClassOf,OWL.Thing))
	for triple in triples:
		genomeGraph.add(triple)
	return uri

def tmp():
	"""Create a private scratch directory and publish it as the global tmpFolder.

	The path stored in tmpFolder ends with "/" so that callers can keep
	appending file names to it by plain string concatenation.
	"""
	import tempfile
	global tmpFolder
	# mkdtemp() picks an unused, unpredictable name and creates the directory
	# readable by the current user only, instead of a guessable time-based
	# name hard-coded under /tmp.
	tmpFolder = tempfile.mkdtemp(prefix="gbktordf_") + "/"

def cleantmp():
	"""List the contents of the global tmpFolder on stdout, then delete it.

	The directory is handled through os/shutil rather than shell commands,
	so paths containing spaces or shell metacharacters are processed
	correctly. A directory that is already gone is not an error.
	"""
	import shutil
	try:
		entries = sorted(os.listdir(tmpFolder))
	except OSError:
		entries = []
	# Keep the listing the previous "ls" call printed (names only, no dotfiles).
	for entry in entries:
		if not entry.startswith("."):
			print(entry)
	shutil.rmtree(tmpFolder, ignore_errors=True)

def crawler():
	"""Parse the file named after "-input" on the command line.

	gbk_parser() reads the "-input" argument from sys.argv itself, so it is
	called without arguments; handing it the path positionally raised a
	TypeError because it accepted no parameters.

	Raises ValueError when "-input" is not present in sys.argv.
	"""
	#From input folder it looks for GBK file (gz files are in progress)
	# Fail here, before any parsing starts, when the option is missing.
	sys.argv.index("-input")
	gbk_parser()

def _plain_position(position):
	"""Return a location position as text with the fuzzy '<' / '>' markers removed."""
	return str(position).replace(">","").replace("<","")

def _feature_uri(genome, typ, feature_type, start, end, strand):
	"""Build a feature URI; features on strand "-1" are keyed as end_start."""
	if strand == "-1":
		span = str(end)+"_"+str(start)
	else:
		span = str(start)+"_"+str(end)
	return coreURI[genome + "/" + typ + "/" + feature_type + "/gbk/" + span]

def gbk_parser(input_file=None):
	"""Parse a GenBank/EMBL file and load its records into the global genomeGraph.

	input_file -- path of the file to parse. When None (the default) the
	              value following "-input" in sys.argv is used.

	Further settings are read from sys.argv: "-format" (handed to
	SeqIO.parse) and, optionally, "-identifier" (genome name; otherwise the
	organism qualifier of the first feature is used, or "XNoneX").

	The first feature of each record is treated as the record header: it
	yields the genome and contig (DnaObject) nodes, the record annotations,
	references and the full sequence. Every later feature becomes a feature
	node linked to the contig and is described by
	store_general_information().

	Raises ValueError when a required command-line option is missing.
	"""
	global genomeGraph
	prevObjStart = -1
	prevObjStop = -1
	store = plugin.get('IOMemory', Store)()
	genomeGraph = Graph(store,URIRef(URI))
	genomeGraph.bind("ssb",coreURI)
	if input_file is None:
		input_file = sys.argv[sys.argv.index("-input")+1]

	#CLASS definitions
	genomeClass = createClass(coreURI["Genome"], root=True)
	typeClass = createClass(coreURI["DnaObject"], root=True)
	createClass(coreURI["Protein"], root=True)
	pubmedClass = createClass(coreURI["Pubmed"], root=True)
	miscClass = createClass(coreURI["MiscFeature"], root=False)
	createClass(coreURI["Feature"], root=True)
	SubClassOfDict["MiscFeature"] = 1
	for rna_class in ("Trna", "Rrna", "Tmrna", "Ncrna"):
		SubClassOfDictRna[rna_class] = 1

	# Characters that are replaced by "_" in the genome name before it is
	# used as part of a URI (the backslash is escaped explicitly).
	weird_chars = ",./?<>:;\"'|\\}]{[+=_-)(*&^%$#@!±§~` "
	weird_table = str.maketrans(dict.fromkeys(weird_chars, "_"))
	scaf_value = 0
	formatGBK = sys.argv[sys.argv.index("-format")+1]

	for record in SeqIO.parse(input_file, formatGBK):
		# A record without features contributes nothing to the graph.
		for index, feature in enumerate(record.features):
			if index == 0:
				#Read first feature for genome name and information...
				if "-identifier" in sys.argv:
					genome = sys.argv[sys.argv.index("-identifier")+1]
				else:
					try:
						genome = feature.qualifiers["organism"][0].replace(" ","_")
					except (KeyError, IndexError):
						#BUG: THIS IS A TEMP FIX, USE GALAXY -IDENTIFIER TO CAPTURE THIS
						genome = "XNoneX"
				genome = genome.translate(weird_table)

				# Contig name: GI number, else first accession, else a running scaffold id.
				annotations = record.annotations
				if "gi" in annotations:
					typ = str(annotations["gi"])
				elif annotations.get("accessions"):
					typ = str(annotations["accessions"][0])
				else:
					scaf_value += 1
					typ = "scaffold_"+str(scaf_value)
				genomeURI = coreURI[genome]
				gbkURI = coreURI[genome + "/" + typ]
				#To contig connection to connect all data to it
				genomeGraph.add((genomeURI, coreURI["dnaobject"] , gbkURI))

				#General genome features also stored in the class...
				# The guard tests the qualifier that is actually read; it used to
				# test "genome", so the organism triple was never written.
				if "organism" in feature.qualifiers:
					genomeGraph.add((genomeURI, coreURI["organism"],Literal(feature.qualifiers["organism"][0])))
				if "strain" in feature.qualifiers:
					genomeGraph.add((genomeURI, coreURI["strain"],Literal(feature.qualifiers["strain"][0])))
				if "taxonomy" in annotations:
					for taxon in annotations["taxonomy"]:
						genomeGraph.add((genomeURI, coreURI["taxonomy"],Literal(taxon)))
					# Emptied so the annotation loop below does not repeat it on the contig.
					annotations["taxonomy"] = []

				#Genome sequence#
				sequence = str(record.seq)
				#Verify if sequence was not empty and is now full of X or N
				if not sequence.replace("X","").replace("N",""):
					sequence = ""

				#Record parsing#
				for annot, value in annotations.items():
					predicate = coreURI[annot.lower()]
					if not isinstance(value, list):
						int_add(gbkURI,predicate,str(value))
					elif annot == "references":
						for reference in value:
							if reference.pubmed_id != "":
								pubmedURI = coreURI["pubmed/"+reference.pubmed_id]
								genomeGraph.add((gbkURI, predicate , pubmedURI))
								for key, attr in vars(reference).items():
									genomeGraph.add((pubmedURI, coreURI[key.lower()], Literal(str(attr))))
								genomeGraph.add((pubmedURI, RDF.type, pubmedClass))
					else:
						for item in value:
							int_add(gbkURI,predicate,str(item))

				#####END of RECORD####
				if len(sequence) > 0:
					genomeGraph.add((gbkURI, coreURI["sequence"] ,  Literal(sequence)))
				genomeGraph.add((genomeURI, RDF.type,genomeClass))
				genomeGraph.add((gbkURI, RDF.type,typeClass))
				for key in feature.qualifiers:
					genomeGraph.add((gbkURI, coreURI[key.lower()] , Literal(feature.qualifiers[key][0])))
				continue

			#The rest of the GBK file
			feature_type = feature.type
			end = _plain_position(feature.location.end)
			start = _plain_position(feature.location.start)
			strand = str(feature.location.strand)

			if strand == 'None':
				# NOTE(review): features without a strand are not stored at all.
				# This keeps the existing behaviour; confirm whether it is intended.
				continue

			if feature_type == "misc_feature": #Store as part of previous cds or something...
				miscURI = _feature_uri(genome, typ, feature_type, start, end, strand)
				# TODO: Check if biopython has an overlap function...
				# A misc_feature lying entirely inside the previous non-misc
				# feature is described but not linked to the contig.
				inside_previous = int(prevObjStart) <= int(start) and int(end) <= int(prevObjStop)
				if not inside_previous:
					genomeGraph.add((gbkURI, coreURI["feature"] , miscURI))
					genomeGraph.add((miscURI,RDF.type,miscClass))
				store_general_information(miscURI,feature,record)
				continue

			prevObjStart = start
			prevObjStop = end
			typeURI = _feature_uri(genome, typ, feature_type, start, end, strand)

			#Contig specific connection
			genomeGraph.add((gbkURI, coreURI["feature"] , typeURI))
			store_general_information(typeURI,feature,record)

			# Biopython releases without SeqFeature.sub_features have nothing to
			# iterate here; reading it unguarded would raise AttributeError.
			for subfeature in getattr(feature, "sub_features", None) or []:
				sub_strand = str(subfeature.location.strand)
				sub_end = _plain_position(subfeature.location.end)
				sub_start = _plain_position(subfeature.location.start)
				subURI = _feature_uri(genome, typ, subfeature.type, sub_start, sub_end, sub_strand)
				genomeGraph.add((typeURI, coreURI["feature"] , subURI))
				store_general_information(subURI,subfeature,record,feature)


def _codon_table(feature, superfeature):
	"""Return the transl_table value to use for a CDS, defaulting to "11".

	The enclosing feature (when one is given) is consulted first, then the
	feature itself.
	"""
	for source in (superfeature, feature):
		try:
			return source.qualifiers["transl_table"][0]
		except (AttributeError, KeyError, IndexError):
			# "" (no superfeature) has no qualifiers; the qualifier may be absent.
			continue
	return "11"

def _computed_translation(feature, record, codon):
	"""Translate a CDS from the record sequence when no translation qualifier exists.

	A single-part location whose length is not a multiple of three yields "".
	The first residue of a non-empty result is forced to "M" and
	leading/trailing "*" are stripped.
	"""
	# extract() joins the parts and reverse-complements minus-strand parts.
	coding = feature.extract(record.seq)
	if len(feature.location.parts) == 1 and len(coding) % 3 != 0:
		return ""
	protein = str(coding.translate(codon))
	if protein:
		protein = ("M" + protein[1:]).strip("*")
	return protein

def store_general_information(generalURI,feature,record,superfeature=""):
	"""Describe one feature in the global graph.

	generalURI   -- URI of the feature node that receives the triples.
	feature      -- Biopython feature (or sub-feature) to describe.
	record       -- record the feature belongs to; supplies the sequence.
	superfeature -- enclosing feature when `feature` is a sub-feature,
	                otherwise "".

	Written for every feature: source database (value after "-sourcedb" in
	sys.argv), begin, end, strand and all qualifiers except "translation".
	Every type except misc_feature also gets its nucleotide sequence and an
	rdf:type named after the feature type. A CDS is additionally linked to
	a protein node keyed by the MD5 of its translation; that node is
	described (md5, sequence, type) when a translation qualifier exists or
	a non-empty translation could be computed.

	Raises ValueError when "-sourcedb" is missing from sys.argv.
	"""
	proteinClass = createClass(coreURI["Protein"], root=True)
	feature_type = feature.type
	end = str(feature.location.end).replace(">","").replace("<","")
	start = str(feature.location.start).replace(">","").replace("<","")
	strand = str(feature.location.strand)
	if strand == "None":
		strand = 0

	genomeGraph.add((generalURI,coreURI["sourcedb"],Literal(sys.argv[sys.argv.index("-sourcedb")+1])))

	#Fixes the 0 count instead of 1-count in biopython vs humans
	# On strand "-1" begin and end are swapped so that begin is the higher coordinate.
	if strand == "-1":
		begin, stop = int(end), int(start)+1
	else:
		begin, stop = int(start)+1, int(end)
	genomeGraph.add((generalURI,coreURI["begin"],Literal(begin)))
	genomeGraph.add((generalURI,coreURI["end"],Literal(stop)))
	genomeGraph.add((generalURI,coreURI["strand"],Literal(int(strand))))

	if feature_type != "misc_feature":
		# Extract straight from the Seq object; converting the whole record
		# sequence to str for every feature copied the genome each time.
		genomeGraph.add((generalURI,coreURI["sequence"],Literal(str(feature.extract(record.seq)))))
		class_name = feature_type.lower().title()
		genomeGraph.add((generalURI,RDF.type,createClass(coreURI[class_name], root=False)))
		# RNA classes are registered separately, in SubClassOfDictRna by gbk_parser().
		if feature_type.lower() not in ("rrna", "trna", "tmrna", "ncrna"):
			SubClassOfDict[class_name] = 1

	for key in feature.qualifiers:
		values = feature.qualifiers[key]
		if key == "translation":
			continue
		if isinstance(values, list):
			for v in values:
				int_add(generalURI,coreURI[key.lower()],v)
		else:
			int_add(generalURI,coreURI[key.lower()],values)

	if feature_type != "CDS":
		return

	#Feature is normally submitted to this function
	#IF a subfeature is submitted it is submitted as a feature
	#And the superfeature argument will contain the enclosing feature
	codon = _codon_table(feature, superfeature)

	#Protein linkage
	try:
		translation = feature.qualifiers["translation"][0].strip("*")
	except (KeyError, IndexError):
		#When protein sequence is not given...
		translation = _computed_translation(feature, record, codon)

	# Only the digest needs bytes; the literal below keeps the text form.
	md5_protein = hashlib.md5(translation.encode('utf-8')).hexdigest()
	proteinURI = coreURI["protein/"+md5_protein]
	genomeGraph.add((generalURI,coreURI["protein"],proteinURI))
	if translation or "translation" in feature.qualifiers:
		genomeGraph.add((proteinURI,coreURI["md5"],Literal(md5_protein)))
		genomeGraph.add((proteinURI,coreURI["sequence"],Literal(translation)))
		genomeGraph.add((proteinURI,RDF.type,proteinClass))
	
def int_add(subject, predicate, obj):
	"""Add (subject, predicate, obj) to the global graph, typing integers as integers.

	Double quotes are removed from `obj`. If the remaining text parses as
	an integer it is stored as an integer literal, otherwise as a plain
	string literal. Decimal text such as "1.5" stays a string, exactly as
	before. Very large integers are now kept exact; they used to be
	stored through a lossy float conversion.

	subject, predicate -- graph terms of the triple.
	obj                -- value to store; non-string values are converted
	                      with str() first.
	"""
	text = obj if isinstance(obj, str) else str(obj)
	text = text.replace('"','')
	try:
		value = int(text)
	except ValueError:
		value = text
	genomeGraph.add((subject,predicate,Literal(value)))
				
def save():
	"""Serialize the global graph as Turtle into the file named after "-output".

	Raises ValueError when "-output" is missing from sys.argv.
	"""
	output_path = sys.argv[sys.argv.index("-output")+1]
	data = genomeGraph.serialize(format='turtle')
	# Depending on the rdflib version serialize() returns bytes or str;
	# normalise to bytes so the binary write works with either.
	if isinstance(data, str):
		data = data.encode("utf-8")
	# The context manager closes (and flushes) the file even if the write fails.
	with open(output_path,"wb") as handle:
		handle.write(data)

def subClassOfBuilder():
	"""Make every class registered in SubClassOfDict a subclass of Feature.

	For each registered name, Feature is (re)declared a subclass of
	owl:Thing and the named class a subclass of Feature.
	"""
	feature_class = coreURI["Feature"]
	for class_name in SubClassOfDict.keys():
		for triple in (
			(feature_class,RDFS.subClassOf,OWL.Thing),
			(coreURI[class_name],RDFS.subClassOf,feature_class),
		):
			genomeGraph.add(triple)

def subClassOfBuilderRna():
	"""Place every class registered in SubClassOfDictRna below Rna, itself below Feature.

	For each registered name the chain Feature -> owl:Thing,
	Rna -> Feature and <name> -> Rna is written, and <name> is declared
	an owl:Class.
	"""
	feature_class = coreURI["Feature"]
	rna_class = coreURI["Rna"]
	for class_name in SubClassOfDictRna.keys():
		member = coreURI[class_name]
		for triple in (
			(feature_class,RDFS.subClassOf,OWL.Thing),
			(rna_class,RDFS.subClassOf,feature_class),
			(member,RDFS.subClassOf,rna_class),
			(member,RDF.type,OWL.Class),
		):
			genomeGraph.add(triple)

def main():
	"""Run the conversion: parse the input, build the class hierarchy, write the output.

	The scratch directory created by tmp() is removed even when one of the
	steps raises, so failed runs no longer leave it behind.
	"""
	tmp()
	try:
		gbk_parser()
		subClassOfBuilder()
		subClassOfBuilderRna()
		save()
	finally:
		cleantmp()

# Run the conversion only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
	main()