diff SMART/Java/Python/GetFlanking.py @ 46:169d364ddd91

Uploaded
author m-zytnicki
date Mon, 30 Sep 2013 03:19:26 -0400
parents 44d5973c188c
children
line wrap: on
line diff
--- a/SMART/Java/Python/GetFlanking.py	Wed Sep 18 08:51:22 2013 -0400
+++ b/SMART/Java/Python/GetFlanking.py	Mon Sep 30 03:19:26 2013 -0400
@@ -44,190 +44,188 @@
 TAG_REGION   = "_region"
 TAGS_REGION  = {-1: "_upstream", 0: "", 1: "_downstream"}
 TAGS_RREGION = {-1: "upstream", 0: "overlapping", 1: "downstream"}
-TAGS_SENSE   = {-1: "antisense", 0: "", 1: "collinear"}
+TAGS_SENSE   = {-1: "antisense", 0: "", 1: "colinear"}
 STRANDSTOSTR = {-1: "(-)", 0: "", 1: "(+)"}
 
 
-def getOrderKey(transcript, direction, input):
-	if direction == 1:
-		if input == QUERY:
-			return (transcript.getEnd(), -transcript.getStart())
-		return (transcript.getStart(), -transcript.getEnd())
-	if input == QUERY:
-		return (-transcript.getStart(), transcript.getEnd())
-	return (-transcript.getEnd(), transcript.getStart())
+def getOrderKey(transcript, direction):
+    if direction == 1:
+        return transcript.getEnd()
+    return - transcript.getStart()
+
+def isInGoodRegion(transcriptRef, transcriptQuery, direction):
+    if direction == 1:
+        return transcriptQuery.getEnd() > transcriptRef.getEnd()
+    return transcriptQuery.getStart() < transcriptRef.getStart()
 
 
 class GetFlanking(object):
 
-	def __init__(self, verbosity):
-		self.verbosity   = verbosity
-		self.transcripts = dict([id, {}] for id in INPUTS)
-		self.directions  = []
-		self.noOverlap   = False
-		self.colinear    = False
-		self.antisense   = False
-		self.distance    = None
-		self.minDistance = None
-		self.maxDistance = None
-		self.tagName     = "flanking"
+    def __init__(self, verbosity):
+        self.verbosity   = verbosity
+        self.transcripts = dict([id, {}] for id in INPUTS)
+        self.directions  = []
+        self.noOverlap   = False
+        self.colinear    = False
+        self.antisense   = False
+        self.distance    = None
+        self.minDistance = None
+        self.maxDistance = None
+        self.tagName     = "flanking"
 
-	def setInputFile(self, fileName, format, id):
-		chooser = ParserChooser(self.verbosity)
-		chooser.findFormat(format)
-		parser = chooser.getParser(fileName)
-		for transcript in parser.getIterator():
-			chromosome = transcript.getChromosome()
-			if chromosome not in self.transcripts[id]:
-				self.transcripts[id][chromosome] = []
-			self.transcripts[id][chromosome].append(transcript)
+    def setInputFile(self, fileName, format, id):
+        chooser = ParserChooser(self.verbosity)
+        chooser.findFormat(format)
+        parser = chooser.getParser(fileName)
+        for transcript in parser.getIterator():
+            chromosome = transcript.getChromosome()
+            if chromosome not in self.transcripts[id]:
+                self.transcripts[id][chromosome] = []
+            self.transcripts[id][chromosome].append(transcript)
 
-	def setOutputFile(self, fileName):
-		self.writer = TranscriptWriter(fileName, "gff3", self.verbosity)
+    def setOutputFile(self, fileName):
+        self.writer = TranscriptWriter(fileName, "gff3", self.verbosity)
+
+    def addUpstreamDirection(self, upstream):
+        if upstream:
+            self.directions.append(-1)
+
+    def addDownstreamDirection(self, downstream):
+        if downstream:
+            self.directions.append(1)
 
-	def addUpstreamDirection(self, upstream):
-		if upstream:
-			self.directions.append(-1)
+    def setColinear(self, colinear):
+        self.colinear = colinear
+
+    def setAntisense(self, antisense):
+        self.antisense = antisense
 
-	def addDownstreamDirection(self, downstream):
-		if downstream:
-			self.directions.append(1)
+    def setNoOverlap(self, noOverlap):
+        self.noOverlap = noOverlap
 
-	def setColinear(self, colinear):
-		self.colinear = colinear
+    def setMinDistance(self, distance):
+        self.minDistance = distance
+
+    def setMaxDistance(self, distance):
+        self.maxDistance = distance
 
-	def setAntisense(self, antisense):
-		self.antisense = antisense
-
-	def setNoOverlap(self, noOverlap):
-		self.noOverlap = noOverlap
+    def setNewTagName(self, tagName):
+        self.tagName = tagName
 
-	def setMinDistance(self, distance):
-		self.minDistance = distance
-
-	def setMaxDistance(self, distance):
-		self.maxDistance = distance
-
-	def setNewTagName(self, tagName):
-		self.tagName = tagName
+    def match(self, transcriptRef, transcriptQuery, direction):
+        if self.noOverlap and transcriptRef.overlapWith(transcriptQuery):
+            return False
+        if self.colinear and transcriptRef.getDirection() != transcriptQuery.getDirection():
+            return False
+        if self.antisense and transcriptRef.getDirection() == transcriptQuery.getDirection():
+            return False
+        if self.minDistance != None or self.maxDistance != None:
+            distance = transcriptRef.getDistance(transcriptQuery)
+            if self.minDistance != None and distance < self.minDistance:
+                return False
+            if self.maxDistance != None and distance > self.maxDistance:
+                return False
+        return True
 
-	def match(self, transcriptQuery, transcriptRef, direction):
-		#print "comparing", transcriptQuery, "with", transcriptRef, "on direction", direction
-		if direction == 1 and transcriptRef.getEnd() < transcriptQuery.getStart():
-			return False
-		if direction == -1 and transcriptQuery.getEnd() < transcriptRef.getStart():
-			return False
-		if self.noOverlap and transcriptRef.overlapWith(transcriptQuery):
-			return False
-		if self.colinear and transcriptRef.getDirection() != transcriptQuery.getDirection():
-			return False
-		if self.antisense and transcriptRef.getDirection() == transcriptQuery.getDirection():
-			return False
-		if self.minDistance != None or self.maxDistance != None:
-			distance = transcriptRef.getDistance(transcriptQuery)
-			if self.minDistance != None and distance < self.minDistance:
-				return False
-			if self.maxDistance != None and distance > self.maxDistance:
-				return False
-		return True
-
-	def getFlanking(self, chromosome, direction):
-		if chromosome not in self.transcripts[REFERENCE]:
-			return
-		sortedTranscripts = dict([id, {}] for id in INPUTS)
-		for id in INPUTS:
-			sortedTranscripts[id] = sorted(self.transcripts[id][chromosome], key = lambda t: getOrderKey(t, direction, id))
-		refIndex = 0
-		progress = Progress(len(sortedTranscripts[QUERY]), "Reading chr %s %s" % (chromosome, STRANDSTOSTR[direction]), self.verbosity)
-		for query in sortedTranscripts[QUERY]:
-			#print "Q: ", query
-			#print "R1: ", sortedTranscripts[REFERENCE][refIndex]
-			while not self.match(query, sortedTranscripts[REFERENCE][refIndex], direction):
-				refIndex += 1
-				if refIndex == len(sortedTranscripts[REFERENCE]):
-					progress.done()
-					#print "done"
-					return
-				#print "R2: ", sortedTranscripts[REFERENCE][refIndex]
-			self.flankings[query][direction] = sortedTranscripts[REFERENCE][refIndex]
-			progress.inc()
-		progress.done()
+    def getFlanking(self, direction):
+        for chromosome in sorted(self.transcripts[REFERENCE].keys()):
+            if chromosome not in self.transcripts[QUERY]:
+                continue
+            sortedTranscripts = dict([id, {}] for id in INPUTS)
+            for id in INPUTS:
+                sortedTranscripts[id] = sorted(self.transcripts[id][chromosome], key = lambda t: getOrderKey(t, direction))
+            refIndex    = 0
+            currentRefs = []
+            outputs     = set()
+            progress    = Progress(len(sortedTranscripts[QUERY]), "Reading chr %s %s" % (chromosome, STRANDSTOSTR[direction]), self.verbosity)
+            for query in sortedTranscripts[QUERY]:
+                while refIndex < len(sortedTranscripts[REFERENCE]) and isInGoodRegion(sortedTranscripts[REFERENCE][refIndex], query, direction):
+                    currentRefs.append(sortedTranscripts[REFERENCE][refIndex])
+                    refIndex += 1
+                nextCurrentRefs = []
+                for currentRef in currentRefs:
+                    if self.match(currentRef, query, direction):
+                        if currentRef not in self.flankings:
+                            self.flankings[currentRef] = {}
+                        self.flankings[currentRef][direction * currentRef.getDirection()] = query
+                    else:
+                        nextCurrentRefs.append(currentRef)
+                currentRefs = nextCurrentRefs
+                progress.inc()
+            progress.done()
 
-	def setTags(self, query, reference, direction):
-		refName = reference.getTagValue("ID")
-		if refName == None:
-			refName = reference.getName()
-		if refName == None:
-			refName = reference.__str__()
-		query.setTagValue("%s%s" % (self.tagName, TAGS_REGION[direction*query.getDirection()]), refName)
-		query.setTagValue("%s_%s%s" % (TAG_DISTANCE, self.tagName, TAGS_REGION[direction*query.getDirection()]), query.getDistance(reference))
-		query.setTagValue("%s_%s" % (TAG_SENSE, self.tagName), TAGS_SENSE[query.getDirection() * reference.getDirection()])
-		if direction == 0:
-			query.setTagValue("%s_%s" % (TAG_REGION, self.tagName), TAGS_RREGION[cmp(query.getRelativeDistance(reference), 0)])
-		for tag in reference.getTagNames():
-			if tag not in ("quality", "feature"):
-				query.setTagValue("%s%s_%s" % (self.tagName, TAGS_REGION[direction*query.getDirection()], tag), reference.getTagValue(tag))
-		return query
+    def setTags(self, query, reference, direction):
+        refName = reference.getTagValue("ID")
+        if refName == None:
+            refName = reference.getName()
+        if refName == None:
+            refName = reference.__str__()
+        query.setTagValue("%s%s" % (self.tagName, TAGS_REGION[direction]), refName)
+        query.setTagValue("%s_%s%s" % (TAG_DISTANCE, self.tagName, TAGS_REGION[direction]), query.getDistance(reference))
+        if direction == 0:
+            query.setTagValue("%s_%s" % (TAG_SENSE, self.tagName), TAGS_SENSE[query.getDirection() * reference.getDirection()])
+            query.setTagValue("%s_%s" % (TAG_REGION, self.tagName), TAGS_RREGION[cmp(query.getRelativeDistance(reference), 0)])
+        for tag in reference.getTagNames():
+            if tag not in ("quality", "feature"):
+                query.setTagValue("%s%s_%s" % (self.tagName, TAGS_REGION[direction], tag), reference.getTagValue(tag))
+        return query
 
-	def write(self):
-		progress = Progress(len(self.flankings.keys()), "Printing data", self.verbosity)
-		for transcriptQuery in self.flankings.keys():
-			if not self.flankings[transcriptQuery]:
-				self.writer.addTranscript(transcriptQuery)
-			elif self.directions:
-				for direction in self.directions:
-					#relativeDirection = direction if transcriptQuery.getDirection() == 1 else - direction
-					relativeDirection = direction * transcriptQuery.getDirection()
-					if relativeDirection in self.flankings[transcriptQuery]:
-						transcriptRef = self.flankings[transcriptQuery][relativeDirection]
-						transcriptQuery = self.setTags(transcriptQuery, transcriptRef, relativeDirection)
-				self.writer.addTranscript(transcriptQuery)
-			else:
-				transcriptRef = sorted(self.flankings[transcriptQuery].values(), key = lambda transcriptRef: transcriptQuery.getDistance(transcriptRef))[0]
-				self.writer.addTranscript(self.setTags(transcriptQuery, transcriptRef, 0))
-			progress.inc()
-		progress.done()
+    def write(self):
+        outputs   = set()
+        progress  = Progress(len(self.flankings.keys()), "Printing data", self.verbosity)
+        for transcriptRef in self.flankings.keys():
+            if self.directions:
+                for direction in self.directions:
+                    if direction in self.flankings[transcriptRef]:
+                        query = self.flankings[transcriptRef][direction]
+                        outputs.add(self.setTags(query, transcriptRef, direction))
+            else:
+                if self.flankings[transcriptRef]:
+                    query = sorted(self.flankings[transcriptRef].values(), key = lambda query: query.getDistance(transcriptRef))[0]
+                    outputs.add(self.setTags(query, transcriptRef, 0))
+            progress.inc()
+        for transcript in sorted(list(outputs), key = lambda flanking: (flanking.getChromosome(), flanking.getStart(), flanking.getEnd())):
+            self.writer.addTranscript(transcript)
+        self.writer.close()
+        progress.done()
 
-	def run(self):
-		for chromosome in sorted(self.transcripts[QUERY].keys()):
-			self.flankings = dict([query, {}] for query in self.transcripts[QUERY][chromosome])
-			for direction in STRANDS:
-				#print "comparison", chromosome, direction
-				self.getFlanking(chromosome, direction)
-			self.write()
-		self.writer.close()
+    def run(self):
+        self.flankings = {}
+        for direction in STRANDS:
+            self.getFlanking(direction)
+        self.write()
 
 if __name__ == "__main__":
-	
-	description = "Get Flanking v1.0.1: Get the flanking regions of a set of reference. [Category: Data Selection]"
+    
+    description = "Get Flanking v1.0.1: Get the flanking regions of a set of reference. [Category: Data Selection]"
 
-	parser = OptionParser(description = description)
-	parser.add_option("-i", "--input1",      dest="inputFileName1", action="store",                          type="string", help="query input file [compulsory] [format: file in transcript format given by -f]")
-	parser.add_option("-f", "--format1",     dest="format1",        action="store",                          type="string", help="format of previous file [compulsory] [format: transcript file format]")
-	parser.add_option("-j", "--input2",      dest="inputFileName2", action="store",                          type="string", help="reference input file [compulsory] [format: file in transcript format given by -g]")
-	parser.add_option("-g", "--format2",     dest="format2",        action="store",                          type="string", help="format of previous file [compulsory] [format: transcript file format]")
-	parser.add_option("-5", "--upstream",    dest="upstream",       action="store_true", default=False,                     help="output upstream elements [format: boolean] [default: False]")
-	parser.add_option("-3", "--downstream",  dest="downstream",     action="store_true", default=False,                     help="output downstream elements [format: boolean] [default: False]")
-	parser.add_option("-c", "--colinear",    dest="colinear",       action="store_true", default=False,                     help="find first colinear element [format: boolean] [default: False]")
-	parser.add_option("-a", "--antisense",   dest="antisense",      action="store_true", default=False,                     help="find first anti-sense element [format: boolean] [default: False]")
-	parser.add_option("-e", "--noOverlap",   dest="noOverlap",      action="store_true", default=False,                     help="do not consider elements which are overlapping reference elements [format: boolean] [default: False]")
-	parser.add_option("-d", "--minDistance", dest="minDistance",    action="store",      default=None,       type="int",    help="minimum distance between 2 elements [format: int]")
-	parser.add_option("-D", "--maxDistance", dest="maxDistance",    action="store",      default=None,       type="int",    help="maximum distance between 2 elements [format: int]")
-	parser.add_option("-t", "--tag",         dest="tagName",        action="store",      default="flanking", type="string", help="name of the new tag [format: string] [default: flanking]")
-	parser.add_option("-o", "--output",      dest="outputFileName", action="store",                          type="string", help="output file [format: output file in GFF3 format]")
-	parser.add_option("-v", "--verbosity",   dest="verbosity",      action="store",      default=1,          type="int",    help="trace level [format: int]")
-	(options, args) = parser.parse_args()
+    parser = OptionParser(description = description)
+    parser.add_option("-i", "--input1",      dest="inputFileName1", action="store",                          type="string", help="query input file [compulsory] [format: file in transcript format given by -f]")
+    parser.add_option("-f", "--format1",     dest="format1",        action="store",                          type="string", help="format of previous file [compulsory] [format: transcript file format]")
+    parser.add_option("-j", "--input2",      dest="inputFileName2", action="store",                          type="string", help="reference input file [compulsory] [format: file in transcript format given by -g]")
+    parser.add_option("-g", "--format2",     dest="format2",        action="store",                          type="string", help="format of previous file [compulsory] [format: transcript file format]")
+    parser.add_option("-5", "--upstream",    dest="upstream",       action="store_true", default=False,                     help="output upstream elements [format: boolean] [default: False]")
+    parser.add_option("-3", "--downstream",  dest="downstream",     action="store_true", default=False,                     help="output downstream elements [format: boolean] [default: False]")
+    parser.add_option("-c", "--colinear",    dest="colinear",       action="store_true", default=False,                     help="find first colinear element [format: boolean] [default: False]")
+    parser.add_option("-a", "--antisense",   dest="antisense",      action="store_true", default=False,                     help="find first anti-sense element [format: boolean] [default: False]")
+    parser.add_option("-e", "--noOverlap",   dest="noOverlap",      action="store_true", default=False,                     help="do not consider elements which are overlapping reference elements [format: boolean] [default: False]")
+    parser.add_option("-d", "--minDistance", dest="minDistance",    action="store",      default=None,       type="int",    help="minimum distance between 2 elements [format: int]")
+    parser.add_option("-D", "--maxDistance", dest="maxDistance",    action="store",      default=None,       type="int",    help="maximum distance between 2 elements [format: int]")
+    parser.add_option("-t", "--tag",         dest="tagName",        action="store",      default="flanking", type="string", help="name of the new tag [format: string] [default: flanking]")
+    parser.add_option("-o", "--output",      dest="outputFileName", action="store",                          type="string", help="output file [format: output file in GFF3 format]")
+    parser.add_option("-v", "--verbosity",   dest="verbosity",      action="store",      default=1,          type="int",    help="trace level [format: int]")
+    (options, args) = parser.parse_args()
 
-	gf = GetFlanking(options.verbosity)
-	gf.setInputFile(options.inputFileName1, options.format1, QUERY)
-	gf.setInputFile(options.inputFileName2, options.format2, REFERENCE)
-	gf.setOutputFile(options.outputFileName)
-	gf.addUpstreamDirection(options.upstream)
-	gf.addDownstreamDirection(options.downstream)
-	gf.setColinear(options.colinear)
-	gf.setAntisense(options.antisense)
-	gf.setNoOverlap(options.noOverlap)
-	gf.setMinDistance(options.minDistance)
-	gf.setMaxDistance(options.maxDistance)
-	gf.setNewTagName(options.tagName)
-	gf.run()
+    gf = GetFlanking(options.verbosity)
+    gf.setInputFile(options.inputFileName1, options.format1, QUERY)
+    gf.setInputFile(options.inputFileName2, options.format2, REFERENCE)
+    gf.setOutputFile(options.outputFileName)
+    gf.addUpstreamDirection(options.upstream)
+    gf.addDownstreamDirection(options.downstream)
+    gf.setColinear(options.colinear)
+    gf.setAntisense(options.antisense)
+    gf.setNoOverlap(options.noOverlap)
+    gf.setMinDistance(options.minDistance)
+    gf.setMaxDistance(options.maxDistance)
+    gf.setNewTagName(options.tagName)
+    gf.run()