comparison SMART/Java/Python/GetFlanking.py @ 36:44d5973c188c

Uploaded
author m-zytnicki
date Tue, 30 Apr 2013 15:02:29 -0400
parents
children 169d364ddd91
comparison
equal deleted inserted replaced
35:d94018ca4ada 36:44d5973c188c
1 #! /usr/bin/env python
2 #
3 # Copyright INRA-URGI 2009-2011
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30 #
31 from optparse import OptionParser
32 from commons.core.parsing.ParserChooser import ParserChooser
33 from commons.core.writer.TranscriptWriter import TranscriptWriter
34 from SMART.Java.Python.structure.Transcript import Transcript
35 from SMART.Java.Python.structure.Interval import Interval
36 from SMART.Java.Python.misc.Progress import Progress
37
38 QUERY = 0
39 REFERENCE = 1
40 INPUTS = (QUERY, REFERENCE)
41 STRANDS = (-1, 1)
42 TAG_DISTANCE = "distance_"
43 TAG_SENSE = "_sense"
44 TAG_REGION = "_region"
45 TAGS_REGION = {-1: "_upstream", 0: "", 1: "_downstream"}
46 TAGS_RREGION = {-1: "upstream", 0: "overlapping", 1: "downstream"}
47 TAGS_SENSE = {-1: "antisense", 0: "", 1: "collinear"}
48 STRANDSTOSTR = {-1: "(-)", 0: "", 1: "(+)"}
49
50
51 def getOrderKey(transcript, direction, input):
52 if direction == 1:
53 if input == QUERY:
54 return (transcript.getEnd(), -transcript.getStart())
55 return (transcript.getStart(), -transcript.getEnd())
56 if input == QUERY:
57 return (-transcript.getStart(), transcript.getEnd())
58 return (-transcript.getEnd(), transcript.getStart())
59
60
61 class GetFlanking(object):
62
63 def __init__(self, verbosity):
64 self.verbosity = verbosity
65 self.transcripts = dict([id, {}] for id in INPUTS)
66 self.directions = []
67 self.noOverlap = False
68 self.colinear = False
69 self.antisense = False
70 self.distance = None
71 self.minDistance = None
72 self.maxDistance = None
73 self.tagName = "flanking"
74
75 def setInputFile(self, fileName, format, id):
76 chooser = ParserChooser(self.verbosity)
77 chooser.findFormat(format)
78 parser = chooser.getParser(fileName)
79 for transcript in parser.getIterator():
80 chromosome = transcript.getChromosome()
81 if chromosome not in self.transcripts[id]:
82 self.transcripts[id][chromosome] = []
83 self.transcripts[id][chromosome].append(transcript)
84
85 def setOutputFile(self, fileName):
86 self.writer = TranscriptWriter(fileName, "gff3", self.verbosity)
87
88 def addUpstreamDirection(self, upstream):
89 if upstream:
90 self.directions.append(-1)
91
92 def addDownstreamDirection(self, downstream):
93 if downstream:
94 self.directions.append(1)
95
96 def setColinear(self, colinear):
97 self.colinear = colinear
98
99 def setAntisense(self, antisense):
100 self.antisense = antisense
101
102 def setNoOverlap(self, noOverlap):
103 self.noOverlap = noOverlap
104
105 def setMinDistance(self, distance):
106 self.minDistance = distance
107
108 def setMaxDistance(self, distance):
109 self.maxDistance = distance
110
111 def setNewTagName(self, tagName):
112 self.tagName = tagName
113
114 def match(self, transcriptQuery, transcriptRef, direction):
115 #print "comparing", transcriptQuery, "with", transcriptRef, "on direction", direction
116 if direction == 1 and transcriptRef.getEnd() < transcriptQuery.getStart():
117 return False
118 if direction == -1 and transcriptQuery.getEnd() < transcriptRef.getStart():
119 return False
120 if self.noOverlap and transcriptRef.overlapWith(transcriptQuery):
121 return False
122 if self.colinear and transcriptRef.getDirection() != transcriptQuery.getDirection():
123 return False
124 if self.antisense and transcriptRef.getDirection() == transcriptQuery.getDirection():
125 return False
126 if self.minDistance != None or self.maxDistance != None:
127 distance = transcriptRef.getDistance(transcriptQuery)
128 if self.minDistance != None and distance < self.minDistance:
129 return False
130 if self.maxDistance != None and distance > self.maxDistance:
131 return False
132 return True
133
134 def getFlanking(self, chromosome, direction):
135 if chromosome not in self.transcripts[REFERENCE]:
136 return
137 sortedTranscripts = dict([id, {}] for id in INPUTS)
138 for id in INPUTS:
139 sortedTranscripts[id] = sorted(self.transcripts[id][chromosome], key = lambda t: getOrderKey(t, direction, id))
140 refIndex = 0
141 progress = Progress(len(sortedTranscripts[QUERY]), "Reading chr %s %s" % (chromosome, STRANDSTOSTR[direction]), self.verbosity)
142 for query in sortedTranscripts[QUERY]:
143 #print "Q: ", query
144 #print "R1: ", sortedTranscripts[REFERENCE][refIndex]
145 while not self.match(query, sortedTranscripts[REFERENCE][refIndex], direction):
146 refIndex += 1
147 if refIndex == len(sortedTranscripts[REFERENCE]):
148 progress.done()
149 #print "done"
150 return
151 #print "R2: ", sortedTranscripts[REFERENCE][refIndex]
152 self.flankings[query][direction] = sortedTranscripts[REFERENCE][refIndex]
153 progress.inc()
154 progress.done()
155
156 def setTags(self, query, reference, direction):
157 refName = reference.getTagValue("ID")
158 if refName == None:
159 refName = reference.getName()
160 if refName == None:
161 refName = reference.__str__()
162 query.setTagValue("%s%s" % (self.tagName, TAGS_REGION[direction*query.getDirection()]), refName)
163 query.setTagValue("%s_%s%s" % (TAG_DISTANCE, self.tagName, TAGS_REGION[direction*query.getDirection()]), query.getDistance(reference))
164 query.setTagValue("%s_%s" % (TAG_SENSE, self.tagName), TAGS_SENSE[query.getDirection() * reference.getDirection()])
165 if direction == 0:
166 query.setTagValue("%s_%s" % (TAG_REGION, self.tagName), TAGS_RREGION[cmp(query.getRelativeDistance(reference), 0)])
167 for tag in reference.getTagNames():
168 if tag not in ("quality", "feature"):
169 query.setTagValue("%s%s_%s" % (self.tagName, TAGS_REGION[direction*query.getDirection()], tag), reference.getTagValue(tag))
170 return query
171
172 def write(self):
173 progress = Progress(len(self.flankings.keys()), "Printing data", self.verbosity)
174 for transcriptQuery in self.flankings.keys():
175 if not self.flankings[transcriptQuery]:
176 self.writer.addTranscript(transcriptQuery)
177 elif self.directions:
178 for direction in self.directions:
179 #relativeDirection = direction if transcriptQuery.getDirection() == 1 else - direction
180 relativeDirection = direction * transcriptQuery.getDirection()
181 if relativeDirection in self.flankings[transcriptQuery]:
182 transcriptRef = self.flankings[transcriptQuery][relativeDirection]
183 transcriptQuery = self.setTags(transcriptQuery, transcriptRef, relativeDirection)
184 self.writer.addTranscript(transcriptQuery)
185 else:
186 transcriptRef = sorted(self.flankings[transcriptQuery].values(), key = lambda transcriptRef: transcriptQuery.getDistance(transcriptRef))[0]
187 self.writer.addTranscript(self.setTags(transcriptQuery, transcriptRef, 0))
188 progress.inc()
189 progress.done()
190
191 def run(self):
192 for chromosome in sorted(self.transcripts[QUERY].keys()):
193 self.flankings = dict([query, {}] for query in self.transcripts[QUERY][chromosome])
194 for direction in STRANDS:
195 #print "comparison", chromosome, direction
196 self.getFlanking(chromosome, direction)
197 self.write()
198 self.writer.close()
199
200 if __name__ == "__main__":
201
202 description = "Get Flanking v1.0.1: Get the flanking regions of a set of reference. [Category: Data Selection]"
203
204 parser = OptionParser(description = description)
205 parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="query input file [compulsory] [format: file in transcript format given by -f]")
206 parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
207 parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="reference input file [compulsory] [format: file in transcript format given by -g]")
208 parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
209 parser.add_option("-5", "--upstream", dest="upstream", action="store_true", default=False, help="output upstream elements [format: boolean] [default: False]")
210 parser.add_option("-3", "--downstream", dest="downstream", action="store_true", default=False, help="output downstream elements [format: boolean] [default: False]")
211 parser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False, help="find first colinear element [format: boolean] [default: False]")
212 parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="find first anti-sense element [format: boolean] [default: False]")
213 parser.add_option("-e", "--noOverlap", dest="noOverlap", action="store_true", default=False, help="do not consider elements which are overlapping reference elements [format: boolean] [default: False]")
214 parser.add_option("-d", "--minDistance", dest="minDistance", action="store", default=None, type="int", help="minimum distance between 2 elements [format: int]")
215 parser.add_option("-D", "--maxDistance", dest="maxDistance", action="store", default=None, type="int", help="maximum distance between 2 elements [format: int]")
216 parser.add_option("-t", "--tag", dest="tagName", action="store", default="flanking", type="string", help="name of the new tag [format: string] [default: flanking]")
217 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
218 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
219 (options, args) = parser.parse_args()
220
221 gf = GetFlanking(options.verbosity)
222 gf.setInputFile(options.inputFileName1, options.format1, QUERY)
223 gf.setInputFile(options.inputFileName2, options.format2, REFERENCE)
224 gf.setOutputFile(options.outputFileName)
225 gf.addUpstreamDirection(options.upstream)
226 gf.addDownstreamDirection(options.downstream)
227 gf.setColinear(options.colinear)
228 gf.setAntisense(options.antisense)
229 gf.setNoOverlap(options.noOverlap)
230 gf.setMinDistance(options.minDistance)
231 gf.setMaxDistance(options.maxDistance)
232 gf.setNewTagName(options.tagName)
233 gf.run()