comparison SMART/Java/Python/GetFlanking.py @ 46:169d364ddd91

Uploaded
author m-zytnicki
date Mon, 30 Sep 2013 03:19:26 -0400
parents 44d5973c188c
children
comparison
equal deleted inserted replaced
45:e454402ba9d9 46:169d364ddd91
42 TAG_DISTANCE = "distance_" 42 TAG_DISTANCE = "distance_"
43 TAG_SENSE = "_sense" 43 TAG_SENSE = "_sense"
44 TAG_REGION = "_region" 44 TAG_REGION = "_region"
45 TAGS_REGION = {-1: "_upstream", 0: "", 1: "_downstream"} 45 TAGS_REGION = {-1: "_upstream", 0: "", 1: "_downstream"}
46 TAGS_RREGION = {-1: "upstream", 0: "overlapping", 1: "downstream"} 46 TAGS_RREGION = {-1: "upstream", 0: "overlapping", 1: "downstream"}
47 TAGS_SENSE = {-1: "antisense", 0: "", 1: "collinear"} 47 TAGS_SENSE = {-1: "antisense", 0: "", 1: "colinear"}
48 STRANDSTOSTR = {-1: "(-)", 0: "", 1: "(+)"} 48 STRANDSTOSTR = {-1: "(-)", 0: "", 1: "(+)"}
49 49
50 50
51 def getOrderKey(transcript, direction, input): 51 def getOrderKey(transcript, direction):
52 if direction == 1: 52 if direction == 1:
53 if input == QUERY: 53 return transcript.getEnd()
54 return (transcript.getEnd(), -transcript.getStart()) 54 return - transcript.getStart()
55 return (transcript.getStart(), -transcript.getEnd()) 55
56 if input == QUERY: 56 def isInGoodRegion(transcriptRef, transcriptQuery, direction):
57 return (-transcript.getStart(), transcript.getEnd()) 57 if direction == 1:
58 return (-transcript.getEnd(), transcript.getStart()) 58 return transcriptQuery.getEnd() > transcriptRef.getEnd()
59 return transcriptQuery.getStart() < transcriptRef.getStart()
59 60
60 61
61 class GetFlanking(object): 62 class GetFlanking(object):
62 63
63 def __init__(self, verbosity): 64 def __init__(self, verbosity):
64 self.verbosity = verbosity 65 self.verbosity = verbosity
65 self.transcripts = dict([id, {}] for id in INPUTS) 66 self.transcripts = dict([id, {}] for id in INPUTS)
66 self.directions = [] 67 self.directions = []
67 self.noOverlap = False 68 self.noOverlap = False
68 self.colinear = False 69 self.colinear = False
69 self.antisense = False 70 self.antisense = False
70 self.distance = None 71 self.distance = None
71 self.minDistance = None 72 self.minDistance = None
72 self.maxDistance = None 73 self.maxDistance = None
73 self.tagName = "flanking" 74 self.tagName = "flanking"
74 75
75 def setInputFile(self, fileName, format, id): 76 def setInputFile(self, fileName, format, id):
76 chooser = ParserChooser(self.verbosity) 77 chooser = ParserChooser(self.verbosity)
77 chooser.findFormat(format) 78 chooser.findFormat(format)
78 parser = chooser.getParser(fileName) 79 parser = chooser.getParser(fileName)
79 for transcript in parser.getIterator(): 80 for transcript in parser.getIterator():
80 chromosome = transcript.getChromosome() 81 chromosome = transcript.getChromosome()
81 if chromosome not in self.transcripts[id]: 82 if chromosome not in self.transcripts[id]:
82 self.transcripts[id][chromosome] = [] 83 self.transcripts[id][chromosome] = []
83 self.transcripts[id][chromosome].append(transcript) 84 self.transcripts[id][chromosome].append(transcript)
84 85
85 def setOutputFile(self, fileName): 86 def setOutputFile(self, fileName):
86 self.writer = TranscriptWriter(fileName, "gff3", self.verbosity) 87 self.writer = TranscriptWriter(fileName, "gff3", self.verbosity)
87 88
88 def addUpstreamDirection(self, upstream): 89 def addUpstreamDirection(self, upstream):
89 if upstream: 90 if upstream:
90 self.directions.append(-1) 91 self.directions.append(-1)
91 92
92 def addDownstreamDirection(self, downstream): 93 def addDownstreamDirection(self, downstream):
93 if downstream: 94 if downstream:
94 self.directions.append(1) 95 self.directions.append(1)
95 96
96 def setColinear(self, colinear): 97 def setColinear(self, colinear):
97 self.colinear = colinear 98 self.colinear = colinear
98 99
99 def setAntisense(self, antisense): 100 def setAntisense(self, antisense):
100 self.antisense = antisense 101 self.antisense = antisense
101 102
102 def setNoOverlap(self, noOverlap): 103 def setNoOverlap(self, noOverlap):
103 self.noOverlap = noOverlap 104 self.noOverlap = noOverlap
104 105
105 def setMinDistance(self, distance): 106 def setMinDistance(self, distance):
106 self.minDistance = distance 107 self.minDistance = distance
107 108
108 def setMaxDistance(self, distance): 109 def setMaxDistance(self, distance):
109 self.maxDistance = distance 110 self.maxDistance = distance
110 111
111 def setNewTagName(self, tagName): 112 def setNewTagName(self, tagName):
112 self.tagName = tagName 113 self.tagName = tagName
113 114
114 def match(self, transcriptQuery, transcriptRef, direction): 115 def match(self, transcriptRef, transcriptQuery, direction):
115 #print "comparing", transcriptQuery, "with", transcriptRef, "on direction", direction 116 if self.noOverlap and transcriptRef.overlapWith(transcriptQuery):
116 if direction == 1 and transcriptRef.getEnd() < transcriptQuery.getStart(): 117 return False
117 return False 118 if self.colinear and transcriptRef.getDirection() != transcriptQuery.getDirection():
118 if direction == -1 and transcriptQuery.getEnd() < transcriptRef.getStart(): 119 return False
119 return False 120 if self.antisense and transcriptRef.getDirection() == transcriptQuery.getDirection():
120 if self.noOverlap and transcriptRef.overlapWith(transcriptQuery): 121 return False
121 return False 122 if self.minDistance != None or self.maxDistance != None:
122 if self.colinear and transcriptRef.getDirection() != transcriptQuery.getDirection(): 123 distance = transcriptRef.getDistance(transcriptQuery)
123 return False 124 if self.minDistance != None and distance < self.minDistance:
124 if self.antisense and transcriptRef.getDirection() == transcriptQuery.getDirection(): 125 return False
125 return False 126 if self.maxDistance != None and distance > self.maxDistance:
126 if self.minDistance != None or self.maxDistance != None: 127 return False
127 distance = transcriptRef.getDistance(transcriptQuery) 128 return True
128 if self.minDistance != None and distance < self.minDistance: 129
129 return False 130 def getFlanking(self, direction):
130 if self.maxDistance != None and distance > self.maxDistance: 131 for chromosome in sorted(self.transcripts[REFERENCE].keys()):
131 return False 132 if chromosome not in self.transcripts[QUERY]:
132 return True 133 continue
133 134 sortedTranscripts = dict([id, {}] for id in INPUTS)
134 def getFlanking(self, chromosome, direction): 135 for id in INPUTS:
135 if chromosome not in self.transcripts[REFERENCE]: 136 sortedTranscripts[id] = sorted(self.transcripts[id][chromosome], key = lambda t: getOrderKey(t, direction))
136 return 137 refIndex = 0
137 sortedTranscripts = dict([id, {}] for id in INPUTS) 138 currentRefs = []
138 for id in INPUTS: 139 outputs = set()
139 sortedTranscripts[id] = sorted(self.transcripts[id][chromosome], key = lambda t: getOrderKey(t, direction, id)) 140 progress = Progress(len(sortedTranscripts[QUERY]), "Reading chr %s %s" % (chromosome, STRANDSTOSTR[direction]), self.verbosity)
140 refIndex = 0 141 for query in sortedTranscripts[QUERY]:
141 progress = Progress(len(sortedTranscripts[QUERY]), "Reading chr %s %s" % (chromosome, STRANDSTOSTR[direction]), self.verbosity) 142 while refIndex < len(sortedTranscripts[REFERENCE]) and isInGoodRegion(sortedTranscripts[REFERENCE][refIndex], query, direction):
142 for query in sortedTranscripts[QUERY]: 143 currentRefs.append(sortedTranscripts[REFERENCE][refIndex])
143 #print "Q: ", query 144 refIndex += 1
144 #print "R1: ", sortedTranscripts[REFERENCE][refIndex] 145 nextCurrentRefs = []
145 while not self.match(query, sortedTranscripts[REFERENCE][refIndex], direction): 146 for currentRef in currentRefs:
146 refIndex += 1 147 if self.match(currentRef, query, direction):
147 if refIndex == len(sortedTranscripts[REFERENCE]): 148 if currentRef not in self.flankings:
148 progress.done() 149 self.flankings[currentRef] = {}
149 #print "done" 150 self.flankings[currentRef][direction * currentRef.getDirection()] = query
150 return 151 else:
151 #print "R2: ", sortedTranscripts[REFERENCE][refIndex] 152 nextCurrentRefs.append(currentRef)
152 self.flankings[query][direction] = sortedTranscripts[REFERENCE][refIndex] 153 currentRefs = nextCurrentRefs
153 progress.inc() 154 progress.inc()
154 progress.done() 155 progress.done()
155 156
156 def setTags(self, query, reference, direction): 157 def setTags(self, query, reference, direction):
157 refName = reference.getTagValue("ID") 158 refName = reference.getTagValue("ID")
158 if refName == None: 159 if refName == None:
159 refName = reference.getName() 160 refName = reference.getName()
160 if refName == None: 161 if refName == None:
161 refName = reference.__str__() 162 refName = reference.__str__()
162 query.setTagValue("%s%s" % (self.tagName, TAGS_REGION[direction*query.getDirection()]), refName) 163 query.setTagValue("%s%s" % (self.tagName, TAGS_REGION[direction]), refName)
163 query.setTagValue("%s_%s%s" % (TAG_DISTANCE, self.tagName, TAGS_REGION[direction*query.getDirection()]), query.getDistance(reference)) 164 query.setTagValue("%s_%s%s" % (TAG_DISTANCE, self.tagName, TAGS_REGION[direction]), query.getDistance(reference))
164 query.setTagValue("%s_%s" % (TAG_SENSE, self.tagName), TAGS_SENSE[query.getDirection() * reference.getDirection()]) 165 if direction == 0:
165 if direction == 0: 166 query.setTagValue("%s_%s" % (TAG_SENSE, self.tagName), TAGS_SENSE[query.getDirection() * reference.getDirection()])
166 query.setTagValue("%s_%s" % (TAG_REGION, self.tagName), TAGS_RREGION[cmp(query.getRelativeDistance(reference), 0)]) 167 query.setTagValue("%s_%s" % (TAG_REGION, self.tagName), TAGS_RREGION[cmp(query.getRelativeDistance(reference), 0)])
167 for tag in reference.getTagNames(): 168 for tag in reference.getTagNames():
168 if tag not in ("quality", "feature"): 169 if tag not in ("quality", "feature"):
169 query.setTagValue("%s%s_%s" % (self.tagName, TAGS_REGION[direction*query.getDirection()], tag), reference.getTagValue(tag)) 170 query.setTagValue("%s%s_%s" % (self.tagName, TAGS_REGION[direction], tag), reference.getTagValue(tag))
170 return query 171 return query
171 172
172 def write(self): 173 def write(self):
173 progress = Progress(len(self.flankings.keys()), "Printing data", self.verbosity) 174 outputs = set()
174 for transcriptQuery in self.flankings.keys(): 175 progress = Progress(len(self.flankings.keys()), "Printing data", self.verbosity)
175 if not self.flankings[transcriptQuery]: 176 for transcriptRef in self.flankings.keys():
176 self.writer.addTranscript(transcriptQuery) 177 if self.directions:
177 elif self.directions: 178 for direction in self.directions:
178 for direction in self.directions: 179 if direction in self.flankings[transcriptRef]:
179 #relativeDirection = direction if transcriptQuery.getDirection() == 1 else - direction 180 query = self.flankings[transcriptRef][direction]
180 relativeDirection = direction * transcriptQuery.getDirection() 181 outputs.add(self.setTags(query, transcriptRef, direction))
181 if relativeDirection in self.flankings[transcriptQuery]: 182 else:
182 transcriptRef = self.flankings[transcriptQuery][relativeDirection] 183 if self.flankings[transcriptRef]:
183 transcriptQuery = self.setTags(transcriptQuery, transcriptRef, relativeDirection) 184 query = sorted(self.flankings[transcriptRef].values(), key = lambda query: query.getDistance(transcriptRef))[0]
184 self.writer.addTranscript(transcriptQuery) 185 outputs.add(self.setTags(query, transcriptRef, 0))
185 else: 186 progress.inc()
186 transcriptRef = sorted(self.flankings[transcriptQuery].values(), key = lambda transcriptRef: transcriptQuery.getDistance(transcriptRef))[0] 187 for transcript in sorted(list(outputs), key = lambda flanking: (flanking.getChromosome(), flanking.getStart(), flanking.getEnd())):
187 self.writer.addTranscript(self.setTags(transcriptQuery, transcriptRef, 0)) 188 self.writer.addTranscript(transcript)
188 progress.inc() 189 self.writer.close()
189 progress.done() 190 progress.done()
190 191
191 def run(self): 192 def run(self):
192 for chromosome in sorted(self.transcripts[QUERY].keys()): 193 self.flankings = {}
193 self.flankings = dict([query, {}] for query in self.transcripts[QUERY][chromosome]) 194 for direction in STRANDS:
194 for direction in STRANDS: 195 self.getFlanking(direction)
195 #print "comparison", chromosome, direction 196 self.write()
196 self.getFlanking(chromosome, direction)
197 self.write()
198 self.writer.close()
199 197
200 if __name__ == "__main__": 198 if __name__ == "__main__":
201 199
202 description = "Get Flanking v1.0.1: Get the flanking regions of a set of reference. [Category: Data Selection]" 200 description = "Get Flanking v1.0.1: Get the flanking regions of a set of reference. [Category: Data Selection]"
203 201
204 parser = OptionParser(description = description) 202 parser = OptionParser(description = description)
205 parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="query input file [compulsory] [format: file in transcript format given by -f]") 203 parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="query input file [compulsory] [format: file in transcript format given by -f]")
206 parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]") 204 parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
207 parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="reference input file [compulsory] [format: file in transcript format given by -g]") 205 parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="reference input file [compulsory] [format: file in transcript format given by -g]")
208 parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]") 206 parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
209 parser.add_option("-5", "--upstream", dest="upstream", action="store_true", default=False, help="output upstream elements [format: boolean] [default: False]") 207 parser.add_option("-5", "--upstream", dest="upstream", action="store_true", default=False, help="output upstream elements [format: boolean] [default: False]")
210 parser.add_option("-3", "--downstream", dest="downstream", action="store_true", default=False, help="output downstream elements [format: boolean] [default: False]") 208 parser.add_option("-3", "--downstream", dest="downstream", action="store_true", default=False, help="output downstream elements [format: boolean] [default: False]")
211 parser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False, help="find first colinear element [format: boolean] [default: False]") 209 parser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False, help="find first colinear element [format: boolean] [default: False]")
212 parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="find first anti-sense element [format: boolean] [default: False]") 210 parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="find first anti-sense element [format: boolean] [default: False]")
213 parser.add_option("-e", "--noOverlap", dest="noOverlap", action="store_true", default=False, help="do not consider elements which are overlapping reference elements [format: boolean] [default: False]") 211 parser.add_option("-e", "--noOverlap", dest="noOverlap", action="store_true", default=False, help="do not consider elements which are overlapping reference elements [format: boolean] [default: False]")
214 parser.add_option("-d", "--minDistance", dest="minDistance", action="store", default=None, type="int", help="minimum distance between 2 elements [format: int]") 212 parser.add_option("-d", "--minDistance", dest="minDistance", action="store", default=None, type="int", help="minimum distance between 2 elements [format: int]")
215 parser.add_option("-D", "--maxDistance", dest="maxDistance", action="store", default=None, type="int", help="maximum distance between 2 elements [format: int]") 213 parser.add_option("-D", "--maxDistance", dest="maxDistance", action="store", default=None, type="int", help="maximum distance between 2 elements [format: int]")
216 parser.add_option("-t", "--tag", dest="tagName", action="store", default="flanking", type="string", help="name of the new tag [format: string] [default: flanking]") 214 parser.add_option("-t", "--tag", dest="tagName", action="store", default="flanking", type="string", help="name of the new tag [format: string] [default: flanking]")
217 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]") 215 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
218 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") 216 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
219 (options, args) = parser.parse_args() 217 (options, args) = parser.parse_args()
220 218
221 gf = GetFlanking(options.verbosity) 219 gf = GetFlanking(options.verbosity)
222 gf.setInputFile(options.inputFileName1, options.format1, QUERY) 220 gf.setInputFile(options.inputFileName1, options.format1, QUERY)
223 gf.setInputFile(options.inputFileName2, options.format2, REFERENCE) 221 gf.setInputFile(options.inputFileName2, options.format2, REFERENCE)
224 gf.setOutputFile(options.outputFileName) 222 gf.setOutputFile(options.outputFileName)
225 gf.addUpstreamDirection(options.upstream) 223 gf.addUpstreamDirection(options.upstream)
226 gf.addDownstreamDirection(options.downstream) 224 gf.addDownstreamDirection(options.downstream)
227 gf.setColinear(options.colinear) 225 gf.setColinear(options.colinear)
228 gf.setAntisense(options.antisense) 226 gf.setAntisense(options.antisense)
229 gf.setNoOverlap(options.noOverlap) 227 gf.setNoOverlap(options.noOverlap)
230 gf.setMinDistance(options.minDistance) 228 gf.setMinDistance(options.minDistance)
231 gf.setMaxDistance(options.maxDistance) 229 gf.setMaxDistance(options.maxDistance)
232 gf.setNewTagName(options.tagName) 230 gf.setNewTagName(options.tagName)
233 gf.run() 231 gf.run()