6
+ − 1 #! /usr/bin/env python
+ − 2 #
+ − 3 # Copyright INRA-URGI 2009-2010
+ − 4 #
+ − 5 # This software is governed by the CeCILL license under French law and
+ − 6 # abiding by the rules of distribution of free software. You can use,
+ − 7 # modify and/ or redistribute the software under the terms of the CeCILL
+ − 8 # license as circulated by CEA, CNRS and INRIA at the following URL
+ − 9 # "http://www.cecill.info".
+ − 10 #
+ − 11 # As a counterpart to the access to the source code and rights to copy,
+ − 12 # modify and redistribute granted by the license, users are provided only
+ − 13 # with a limited warranty and the software's author, the holder of the
+ − 14 # economic rights, and the successive licensors have only limited
+ − 15 # liability.
+ − 16 #
+ − 17 # In this respect, the user's attention is drawn to the risks associated
+ − 18 # with loading, using, modifying and/or developing or reproducing the
+ − 19 # software by the user in light of its specific status of free software,
+ − 20 # that may mean that it is complicated to manipulate, and that also
+ − 21 # therefore means that it is reserved for developers and experienced
+ − 22 # professionals having in-depth computer knowledge. Users are therefore
+ − 23 # encouraged to load and test the software's suitability as regards their
+ − 24 # requirements in conditions enabling the security of their systems and/or
+ − 25 # data to be ensured and, more generally, to use and operate it in the
+ − 26 # same conditions as regards security.
+ − 27 #
+ − 28 # The fact that you are presently reading this means that you have had
+ − 29 # knowledge of the CeCILL license and that you accept its terms.
+ − 30 #
+ − 31 """Find random regions in a genome"""
+ − 32
+ − 33 import random, math
+ − 34 from optparse import OptionParser
+ − 35 from commons.core.parsing.FastaParser import *
+ − 36 from commons.core.writer.Gff3Writer import *
+ − 37 from commons.core.writer.MySqlTranscriptWriter import *
+ − 38 from SMART.Java.Python.misc.Progress import *
+ − 39 from SMART.Java.Python.structure.Transcript import Transcript
+ − 40 from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
+ − 41
# Number of uniform integer draws that RandomRegionsGenerator.getDev sums and
# then divides by, i.e. the sample count behind each averaged deviation value.
repetitions = 100
+ − 43
+ − 44
class RandomRegionsGenerator(object):
    """Draw random regions on a genome, or relocate regions read from an
    annotation file, and write the result through Gff3Writer.

    The genome is described either by a FASTA file (setInput) or by a single
    chromosome name and size (setChromosomeName / setGenomeSize).  Regions are
    either created from scratch ("uniform" or "gaussian" distribution) or taken
    from an annotation file (setAnnotation) and moved to random positions.
    """

    def __init__(self, verbosity):
        """Store the trace level and select the default (uniform, single
        strand, no annotation, no FASTA reference) configuration."""
        self.verbosity = verbosity
        self.strands = False
        self.distribution = "uniform"
        self.transcripts = None
        self.sequenceParser = None
        random.seed()

    def setInput(self, fileName):
        """Use the FASTA file *fileName* as the reference genome."""
        self.sequenceParser = FastaParser(fileName, self.verbosity)

    def setGenomeSize(self, size):
        """Set the chromosome size used when no FASTA reference is given."""
        self.genomeSize = size

    def setChromosomeName(self, name):
        """Set the chromosome name used when no FASTA reference is given."""
        self.chromosomeName = name

    def setAnnotation(self, fileName, format):
        """Load the regions to shuffle from *fileName* (in *format*).

        The number of regions to write becomes the number of loaded
        transcripts, and the random size is reset to 0 because the sizes
        then come from the transcripts themselves.
        """
        parser = TranscriptContainer(fileName, format, self.verbosity)
        self.transcripts = [transcript for transcript in parser.getIterator()]
        self.setNumber(len(self.transcripts))
        self.setSize(0)

    def setOutputFile(self, fileName):
        """Set the name of the file the regions are written to."""
        self.outputFileName = fileName

    def setSize(self, size):
        """Use a fixed region size (minimum and maximum are both *size*)."""
        self.minSize = size
        self.maxSize = size

    def setMinSize(self, size):
        """Set the smallest region size that may be drawn."""
        self.minSize = size

    def setMaxSize(self, size):
        """Set the largest region size that may be drawn."""
        self.maxSize = size

    def setNumber(self, number):
        """Set the number of regions to write."""
        self.number = number

    def setStrands(self, strands):
        """If *strands* is true, each drawn position gets a random strand."""
        self.strands = strands

    def setMaxDistribution(self, maxElements):
        """Set the maximum cluster size; a non-None value switches the
        generator to the "gaussian" distribution."""
        if maxElements is None:
            return
        self.maxElements = maxElements
        self.distribution = "gaussian"

    def setDeviationDistribution(self, deviation):
        """Set the deviation around a cluster centre; a non-None value
        switches the generator to the "gaussian" distribution."""
        if deviation is None:
            return
        self.deviation = deviation
        self.distribution = "gaussian"

    def getSizes(self):
        """Fill self.chromosomes, self.sizes, self.cumulatedSizes and
        self.cumulatedSize from the FASTA reference, or from the single
        chromosome name/size when no reference was given."""
        if self.sequenceParser is None:
            self.chromosomes = [self.chromosomeName]
            self.sizes = {self.chromosomeName: self.genomeSize}
            self.cumulatedSize = self.genomeSize
            self.cumulatedSizes = {self.chromosomeName: self.genomeSize}
            return
        self.chromosomes = self.sequenceParser.getRegions()
        self.sizes = {}
        self.cumulatedSize = 0
        self.cumulatedSizes = {}
        for chromosome in self.chromosomes:
            self.sizes[chromosome] = self.sequenceParser.getSizeOfRegion(chromosome)
            self.cumulatedSize += self.sizes[chromosome]
            self.cumulatedSizes[chromosome] = self.cumulatedSize

    def findPosition(self, size = None):
        """Draw a random (chromosome, start, size) triple.

        Chromosomes are drawn with a probability proportional to their size;
        a chromosome too short for the region is rejected and another one is
        drawn.  *start* is 1-based and chosen so that start + size - 1 does
        not exceed the chromosome size.

        If *size* is None it is drawn between self.minSize and self.maxSize.
        Raises ValueError when no chromosome can hold the region.
        """
        if size is None:
            size = random.randint(self.minSize, self.maxSize)
        # A zero-sized request still needs one valid coordinate to start on.
        span = max(size, 1)
        if not self.sizes or span > max(self.sizes.values()):
            raise ValueError("Cannot place a region of size %d: no chromosome is long enough" % (size))
        while True:
            # randrange excludes its upper bound, so every base of the genome
            # has the same weight (randint(0, total) favoured the last one).
            integer = random.randrange(self.cumulatedSize)
            for chromosome in self.chromosomes:
                if self.cumulatedSizes[chromosome] > integer:
                    break
            if self.sizes[chromosome] >= span:
                break
        # Last valid start is chromosomeSize - span + 1 (region ends on the last base).
        start = random.randint(1, self.sizes[chromosome] - span + 1)
        return (chromosome, start, size)

    def createTranscript(self, chromosome, start, size, strand, cpt):
        """Build a Transcript named "rand_<cpt>" covering
        [start, start + size - 1] on *chromosome* with direction *strand*."""
        transcript = Transcript()
        transcript.setChromosome(chromosome)
        transcript.setStart(start)
        transcript.setEnd(start + size-1)
        transcript.setDirection(strand)
        transcript.setName("rand_%d" % (cpt))
        return transcript

    def moveTranscript(self, chromosome, start, transcript):
        """Shift *transcript* (and its exons) so that it starts at *start* on
        *chromosome*; if it does not fit there, new positions are drawn until
        it does.  Returns a one-element list holding the moved transcript."""
        length = transcript.getEnd() - transcript.getStart() + 1
        # The limit is the length of this very chromosome, not the running
        # total of all chromosomes up to it.
        while start + length - 1 > self.sizes[chromosome]:
            chromosome, start, size = self.findPosition(length)
        transcript.setChromosome(chromosome)
        oldStart, oldEnd = transcript.getStart(), transcript.getEnd()
        offset = start - oldStart
        if transcript.getNbExons() > 1:
            # NOTE(review): relies on Transcript.getExons() returning the exon
            # objects; the previous code iterated over the exon *count*.
            for exon in transcript.getExons():
                oldExonStart, oldExonEnd = exon.getStart(), exon.getEnd()
                exon.setStart(oldExonStart + offset)
                exon.setEnd(oldExonEnd + offset)
        transcript.setStart(start)
        transcript.setEnd(oldEnd + offset)
        return [transcript]

    def createUniformCluster(self, chromosome, start, size, strand, cpt):
        """Return a one-element list holding a single new transcript."""
        transcript = self.createTranscript(chromosome, start, size, strand, cpt)
        return [transcript]

    def findNbTranscripts(self, cpt):
        """Draw the number of transcripts of a cluster: exp(u * log(maxElements))
        with u uniform in [0, 1), rounded, and capped so that no more than
        self.number regions are produced overall (*cpt* is the 1-based index
        of the next region)."""
        drawn = int(round(math.exp(random.random() * math.log(self.maxElements))))
        return min(drawn, self.number - cpt + 1)

    def getDev(self):
        """Return the rounded mean of `repetitions` uniform integer draws in
        [-self.deviation, self.deviation]."""
        deviation = 0.0
        for j in range(repetitions):
            deviation += random.randint(-self.deviation, self.deviation)
        deviation /= repetitions
        deviation = int(round(deviation))
        return deviation

    def createGaussianCluster(self, chromosome, start, size, strand, cpt):
        """Return a list of transcripts scattered around (start, size): each
        one gets an independent getDev() offset on its start and on its size.

        NOTE(review): the offsets are not clamped, so a transcript may start
        before 1 or run past the chromosome end -- confirm this is intended.
        """
        transcripts = []
        nbTranscripts = self.findNbTranscripts(cpt)
        for i in range(nbTranscripts):
            transcript = self.createTranscript(chromosome, start + self.getDev(), size + self.getDev(), strand, cpt + i)
            transcripts.append(transcript)
        return transcripts

    def writeRegions(self):
        """Generate (or move) self.number regions and write them with Gff3Writer."""
        # The writer owns the output file; no second handle is opened on it.
        writer = Gff3Writer(self.outputFileName, self.verbosity)
        progress = Progress(self.number, "Writing to %s" % (self.outputFileName), self.verbosity)
        i = 0
        while i < self.number:
            chromosome, start, size = self.findPosition()
            strand = random.choice([-1, 1]) if self.strands else 1
            if self.transcripts is not None:
                transcripts = self.moveTranscript(chromosome, start, self.transcripts[i])
            elif self.distribution == "uniform":
                transcripts = self.createUniformCluster(chromosome, start, size, strand, i+1)
            else:
                transcripts = self.createGaussianCluster(chromosome, start, size, strand, i+1)
            # A cluster may hold several transcripts; each one counts as a region.
            for transcript in transcripts:
                writer.addTranscript(transcript)
                i += 1
                progress.inc()
        progress.done()
        writer.write()
        writer.close()

    def run(self):
        """Compute the chromosome sizes, then generate and write the regions."""
        self.getSizes()
        self.writeRegions()
+ − 223
+ − 224
if __name__ == "__main__":

    # parse command line
    description = "Get Random Regions v1.0.2: Get some random coordinates on a genome. May use uniform or gaussian distribution (in gaussion distribution, # of element per cluster follows a power law). [Category: Other]"

    parser = OptionParser(description = description)
    parser.add_option("-r", "--reference", dest="reference", action="store", default=None, type="string", help="file that contains the sequences [format: file in FASTA format]")
    parser.add_option("-S", "--referenceSize", dest="referenceSize", action="store", default=None, type="int", help="size of the chromosome (when no reference is given) [format: int]")
    parser.add_option("-c", "--chromosome", dest="chromosome", action="store", default=None, type="string", help="name of the chromosome (when no reference is given) [format: string]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in FASTA format]")
    parser.add_option("-i", "--input", dest="inputFileName", action="store", default=None, type="string", help="optional file containing regions to shuffle [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", default=None, type="string", help="format of the previous file [format: transcript file format]")
    parser.add_option("-s", "--size", dest="size", action="store", default=None, type="int", help="size of the regions (if no region set is provided) [format: int]")
    parser.add_option("-z", "--minSize", dest="minSize", action="store", default=None, type="int", help="minimum size of the regions (if no region set nor a fixed size are provided) [format: int]")
    parser.add_option("-Z", "--maxSize", dest="maxSize", action="store", default=None, type="int", help="maximum size of the regions (if no region set nor a fixed size are provided) [format: int]")
    parser.add_option("-n", "--number", dest="number", action="store", default=None, type="int", help="number of regions (if no region set is provided) [format: int]")
    parser.add_option("-t", "--strands", dest="strands", action="store_true", default=False, help="use both strands (if no region set is provided) [format: boolean]")
    parser.add_option("-m", "--max", dest="max", action="store", default=None, type="int", help="max. # reads in a cluster (for Gaussian dist.) [format: int]")
    parser.add_option("-d", "--deviation", dest="deviation", action="store", default=None, type="int", help="deviation around the center of the cluster (for Gaussian dist.) [format: int]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    # Reject incomplete command lines here, with a usage message, instead of
    # failing later with a traceback from inside the generator.
    if options.outputFileName is None:
        parser.error("no output file given (option -o)")
    if options.reference is None and options.referenceSize is None:
        parser.error("give either a reference file (option -r) or a chromosome size (option -S)")
    if options.inputFileName is None:
        if options.number is None:
            parser.error("give the number of regions (option -n) or a set of regions to shuffle (option -i)")
        if options.size is None and (options.minSize is None or options.maxSize is None):
            parser.error("give a region size (option -s) or both a minimum and a maximum size (options -z and -Z)")

    rrg = RandomRegionsGenerator(options.verbosity)

    # genome: FASTA reference, or a single chromosome described by name and size
    if options.reference is None:
        rrg.setGenomeSize(options.referenceSize)
        rrg.setChromosomeName(options.chromosome)
    else:
        rrg.setInput(options.reference)
    rrg.setOutputFile(options.outputFileName)

    # regions: drawn from scratch, or read from an annotation file and shuffled
    if options.inputFileName is None:
        if options.size is not None:
            rrg.setSize(options.size)
        else:
            rrg.setMinSize(options.minSize)
            rrg.setMaxSize(options.maxSize)
        rrg.setNumber(options.number)
        rrg.setStrands(options.strands)
    else:
        rrg.setAnnotation(options.inputFileName, options.format)

    # either of these options, when given, switches to the gaussian distribution
    rrg.setMaxDistribution(options.max)
    rrg.setDeviationDistribution(options.deviation)
    rrg.run()
+ − 267