| 6 | 1 #! /usr/bin/env python | 
|  | 2 # | 
|  | 3 # Copyright INRA-URGI 2009-2011 | 
|  | 4 # | 
|  | 5 # This software is governed by the CeCILL license under French law and | 
|  | 6 # abiding by the rules of distribution of free software. You can use, | 
|  | 7 # modify and/ or redistribute the software under the terms of the CeCILL | 
|  | 8 # license as circulated by CEA, CNRS and INRIA at the following URL | 
|  | 9 # "http://www.cecill.info". | 
|  | 10 # | 
|  | 11 # As a counterpart to the access to the source code and rights to copy, | 
|  | 12 # modify and redistribute granted by the license, users are provided only | 
|  | 13 # with a limited warranty and the software's author, the holder of the | 
|  | 14 # economic rights, and the successive licensors have only limited | 
|  | 15 # liability. | 
|  | 16 # | 
|  | 17 # In this respect, the user's attention is drawn to the risks associated | 
|  | 18 # with loading, using, modifying and/or developing or reproducing the | 
|  | 19 # software by the user in light of its specific status of free software, | 
|  | 20 # that may mean that it is complicated to manipulate, and that also | 
|  | 21 # therefore means that it is reserved for developers and experienced | 
|  | 22 # professionals having in-depth computer knowledge. Users are therefore | 
|  | 23 # encouraged to load and test the software's suitability as regards their | 
|  | 24 # requirements in conditions enabling the security of their systems and/or | 
|  | 25 # data to be ensured and, more generally, to use and operate it in the | 
|  | 26 # same conditions as regards security. | 
|  | 27 # | 
|  | 28 # The fact that you are presently reading this means that you have had | 
|  | 29 # knowledge of the CeCILL license and that you accept its terms. | 
|  | 30 # | 
|  | 31 from optparse import OptionParser | 
|  | 32 from commons.core.parsing.ParserChooser import ParserChooser | 
|  | 33 from commons.core.writer.TranscriptWriter import TranscriptWriter | 
|  | 34 from SMART.Java.Python.structure.Transcript import Transcript | 
|  | 35 from SMART.Java.Python.structure.Interval import Interval | 
|  | 36 from SMART.Java.Python.misc.Progress import Progress | 
|  | 37 | 
|  | 38 QUERY        = 0 | 
|  | 39 REFERENCE    = 1 | 
|  | 40 INPUTS       = (QUERY, REFERENCE) | 
|  | 41 STRANDS      = (-1, 1) | 
|  | 42 TAG_DISTANCE = "distance_" | 
|  | 43 TAG_SENSE    = "_sense" | 
|  | 44 TAG_REGION   = "_region" | 
|  | 45 TAGS_REGION  = {-1: "_upstream", 0: "", 1: "_downstream"} | 
|  | 46 TAGS_RREGION = {-1: "upstream", 0: "overlapping", 1: "downstream"} | 
|  | 47 TAGS_SENSE   = {-1: "antisense", 0: "", 1: "colinear"} | 
|  | 48 STRANDSTOSTR = {-1: "(-)", 0: "", 1: "(+)"} | 
|  | 49 | 
|  | 50 | 
|  | 51 def getOrderKey(transcript, direction): | 
|  | 52     if direction == 1: | 
|  | 53         return transcript.getEnd() | 
|  | 54     return - transcript.getStart() | 
|  | 55 | 
|  | 56 def isInGoodRegion(transcriptRef, transcriptQuery, direction): | 
|  | 57     if direction == 1: | 
|  | 58         return transcriptQuery.getEnd() > transcriptRef.getEnd() | 
|  | 59     return transcriptQuery.getStart() < transcriptRef.getStart() | 
|  | 60 | 
|  | 61 | 
|  | 62 class GetFlanking(object): | 
|  | 63 | 
|  | 64     def __init__(self, verbosity): | 
|  | 65         self.verbosity   = verbosity | 
|  | 66         self.transcripts = dict([id, {}] for id in INPUTS) | 
|  | 67         self.directions  = [] | 
|  | 68         self.noOverlap   = False | 
|  | 69         self.colinear    = False | 
|  | 70         self.antisense   = False | 
|  | 71         self.distance    = None | 
|  | 72         self.minDistance = None | 
|  | 73         self.maxDistance = None | 
|  | 74         self.tagName     = "flanking" | 
|  | 75 | 
|  | 76     def setInputFile(self, fileName, format, id): | 
|  | 77         chooser = ParserChooser(self.verbosity) | 
|  | 78         chooser.findFormat(format) | 
|  | 79         parser = chooser.getParser(fileName) | 
|  | 80         for transcript in parser.getIterator(): | 
|  | 81             chromosome = transcript.getChromosome() | 
|  | 82             if chromosome not in self.transcripts[id]: | 
|  | 83                 self.transcripts[id][chromosome] = [] | 
|  | 84             self.transcripts[id][chromosome].append(transcript) | 
|  | 85 | 
|  | 86     def setOutputFile(self, fileName): | 
|  | 87         self.writer = TranscriptWriter(fileName, "gff3", self.verbosity) | 
|  | 88 | 
|  | 89     def addUpstreamDirection(self, upstream): | 
|  | 90         if upstream: | 
|  | 91             self.directions.append(-1) | 
|  | 92 | 
|  | 93     def addDownstreamDirection(self, downstream): | 
|  | 94         if downstream: | 
|  | 95             self.directions.append(1) | 
|  | 96 | 
|  | 97     def setColinear(self, colinear): | 
|  | 98         self.colinear = colinear | 
|  | 99 | 
|  | 100     def setAntisense(self, antisense): | 
|  | 101         self.antisense = antisense | 
|  | 102 | 
|  | 103     def setNoOverlap(self, noOverlap): | 
|  | 104         self.noOverlap = noOverlap | 
|  | 105 | 
|  | 106     def setMinDistance(self, distance): | 
|  | 107         self.minDistance = distance | 
|  | 108 | 
|  | 109     def setMaxDistance(self, distance): | 
|  | 110         self.maxDistance = distance | 
|  | 111 | 
|  | 112     def setNewTagName(self, tagName): | 
|  | 113         self.tagName = tagName | 
|  | 114 | 
|  | 115     def match(self, transcriptRef, transcriptQuery, direction): | 
|  | 116         if self.noOverlap and transcriptRef.overlapWith(transcriptQuery): | 
|  | 117             return False | 
|  | 118         if self.colinear and transcriptRef.getDirection() != transcriptQuery.getDirection(): | 
|  | 119             return False | 
|  | 120         if self.antisense and transcriptRef.getDirection() == transcriptQuery.getDirection(): | 
|  | 121             return False | 
|  | 122         if self.minDistance != None or self.maxDistance != None: | 
|  | 123             distance = transcriptRef.getDistance(transcriptQuery) | 
|  | 124             if self.minDistance != None and distance < self.minDistance: | 
|  | 125                 return False | 
|  | 126             if self.maxDistance != None and distance > self.maxDistance: | 
|  | 127                 return False | 
|  | 128         return True | 
|  | 129 | 
|  | 130     def getFlanking(self, direction): | 
|  | 131         for chromosome in sorted(self.transcripts[REFERENCE].keys()): | 
|  | 132             if chromosome not in self.transcripts[QUERY]: | 
|  | 133                 continue | 
|  | 134             sortedTranscripts = dict([id, {}] for id in INPUTS) | 
|  | 135             for id in INPUTS: | 
|  | 136                 sortedTranscripts[id] = sorted(self.transcripts[id][chromosome], key = lambda t: getOrderKey(t, direction)) | 
|  | 137             refIndex    = 0 | 
|  | 138             currentRefs = [] | 
|  | 139             outputs     = set() | 
|  | 140             progress    = Progress(len(sortedTranscripts[QUERY]), "Reading chr %s %s" % (chromosome, STRANDSTOSTR[direction]), self.verbosity) | 
|  | 141             for query in sortedTranscripts[QUERY]: | 
|  | 142                 while refIndex < len(sortedTranscripts[REFERENCE]) and isInGoodRegion(sortedTranscripts[REFERENCE][refIndex], query, direction): | 
|  | 143                     currentRefs.append(sortedTranscripts[REFERENCE][refIndex]) | 
|  | 144                     refIndex += 1 | 
|  | 145                 nextCurrentRefs = [] | 
|  | 146                 for currentRef in currentRefs: | 
|  | 147                     if self.match(currentRef, query, direction): | 
|  | 148                         if currentRef not in self.flankings: | 
|  | 149                             self.flankings[currentRef] = {} | 
|  | 150                         self.flankings[currentRef][direction * currentRef.getDirection()] = query | 
|  | 151                     else: | 
|  | 152                         nextCurrentRefs.append(currentRef) | 
|  | 153                 currentRefs = nextCurrentRefs | 
|  | 154                 progress.inc() | 
|  | 155             progress.done() | 
|  | 156 | 
|  | 157     def setTags(self, query, reference, direction): | 
|  | 158         refName = reference.getTagValue("ID") | 
|  | 159         if refName == None: | 
|  | 160             refName = reference.getName() | 
|  | 161         if refName == None: | 
|  | 162             refName = reference.__str__() | 
|  | 163         query.setTagValue("%s%s" % (self.tagName, TAGS_REGION[direction]), refName) | 
|  | 164         query.setTagValue("%s_%s%s" % (TAG_DISTANCE, self.tagName, TAGS_REGION[direction]), query.getDistance(reference)) | 
|  | 165         if direction == 0: | 
|  | 166             query.setTagValue("%s_%s" % (TAG_SENSE, self.tagName), TAGS_SENSE[query.getDirection() * reference.getDirection()]) | 
|  | 167             query.setTagValue("%s_%s" % (TAG_REGION, self.tagName), TAGS_RREGION[cmp(query.getRelativeDistance(reference), 0)]) | 
|  | 168         for tag in reference.getTagNames(): | 
|  | 169             if tag not in ("quality", "feature"): | 
|  | 170                 query.setTagValue("%s%s_%s" % (self.tagName, TAGS_REGION[direction], tag), reference.getTagValue(tag)) | 
|  | 171         return query | 
|  | 172 | 
|  | 173     def write(self): | 
|  | 174         outputs   = set() | 
|  | 175         progress  = Progress(len(self.flankings.keys()), "Printing data", self.verbosity) | 
|  | 176         for transcriptRef in self.flankings.keys(): | 
|  | 177             if self.directions: | 
|  | 178                 for direction in self.directions: | 
|  | 179                     if direction in self.flankings[transcriptRef]: | 
|  | 180                         query = self.flankings[transcriptRef][direction] | 
|  | 181                         outputs.add(self.setTags(query, transcriptRef, direction)) | 
|  | 182             else: | 
|  | 183                 if self.flankings[transcriptRef]: | 
|  | 184                     query = sorted(self.flankings[transcriptRef].values(), key = lambda query: query.getDistance(transcriptRef))[0] | 
|  | 185                     outputs.add(self.setTags(query, transcriptRef, 0)) | 
|  | 186             progress.inc() | 
|  | 187         for transcript in sorted(list(outputs), key = lambda flanking: (flanking.getChromosome(), flanking.getStart(), flanking.getEnd())): | 
|  | 188             self.writer.addTranscript(transcript) | 
|  | 189         self.writer.close() | 
|  | 190         progress.done() | 
|  | 191 | 
|  | 192     def run(self): | 
|  | 193         self.flankings = {} | 
|  | 194         for direction in STRANDS: | 
|  | 195             self.getFlanking(direction) | 
|  | 196         self.write() | 
|  | 197 | 
|  | 198 if __name__ == "__main__": | 
|  | 199 | 
|  | 200     description = "Get Flanking v1.0.1: Get the flanking regions of a set of reference. [Category: Data Selection]" | 
|  | 201 | 
|  | 202     parser = OptionParser(description = description) | 
|  | 203     parser.add_option("-i", "--input1",      dest="inputFileName1", action="store",                          type="string", help="query input file [compulsory] [format: file in transcript format given by -f]") | 
|  | 204     parser.add_option("-f", "--format1",     dest="format1",        action="store",                          type="string", help="format of previous file [compulsory] [format: transcript file format]") | 
|  | 205     parser.add_option("-j", "--input2",      dest="inputFileName2", action="store",                          type="string", help="reference input file [compulsory] [format: file in transcript format given by -g]") | 
|  | 206     parser.add_option("-g", "--format2",     dest="format2",        action="store",                          type="string", help="format of previous file [compulsory] [format: transcript file format]") | 
|  | 207     parser.add_option("-5", "--upstream",    dest="upstream",       action="store_true", default=False,                     help="output upstream elements [format: boolean] [default: False]") | 
|  | 208     parser.add_option("-3", "--downstream",  dest="downstream",     action="store_true", default=False,                     help="output downstream elements [format: boolean] [default: False]") | 
|  | 209     parser.add_option("-c", "--colinear",    dest="colinear",       action="store_true", default=False,                     help="find first colinear element [format: boolean] [default: False]") | 
|  | 210     parser.add_option("-a", "--antisense",   dest="antisense",      action="store_true", default=False,                     help="find first anti-sense element [format: boolean] [default: False]") | 
|  | 211     parser.add_option("-e", "--noOverlap",   dest="noOverlap",      action="store_true", default=False,                     help="do not consider elements which are overlapping reference elements [format: boolean] [default: False]") | 
|  | 212     parser.add_option("-d", "--minDistance", dest="minDistance",    action="store",      default=None,       type="int",    help="minimum distance between 2 elements [format: int]") | 
|  | 213     parser.add_option("-D", "--maxDistance", dest="maxDistance",    action="store",      default=None,       type="int",    help="maximum distance between 2 elements [format: int]") | 
|  | 214     parser.add_option("-t", "--tag",         dest="tagName",        action="store",      default="flanking", type="string", help="name of the new tag [format: string] [default: flanking]") | 
|  | 215     parser.add_option("-o", "--output",      dest="outputFileName", action="store",                          type="string", help="output file [format: output file in GFF3 format]") | 
|  | 216     parser.add_option("-v", "--verbosity",   dest="verbosity",      action="store",      default=1,          type="int",    help="trace level [format: int]") | 
|  | 217     (options, args) = parser.parse_args() | 
|  | 218 | 
|  | 219     gf = GetFlanking(options.verbosity) | 
|  | 220     gf.setInputFile(options.inputFileName1, options.format1, QUERY) | 
|  | 221     gf.setInputFile(options.inputFileName2, options.format2, REFERENCE) | 
|  | 222     gf.setOutputFile(options.outputFileName) | 
|  | 223     gf.addUpstreamDirection(options.upstream) | 
|  | 224     gf.addDownstreamDirection(options.downstream) | 
|  | 225     gf.setColinear(options.colinear) | 
|  | 226     gf.setAntisense(options.antisense) | 
|  | 227     gf.setNoOverlap(options.noOverlap) | 
|  | 228     gf.setMinDistance(options.minDistance) | 
|  | 229     gf.setMaxDistance(options.maxDistance) | 
|  | 230     gf.setNewTagName(options.tagName) | 
|  | 231     gf.run() |