diff SMART/Java/Python/getRandomRegions.py @ 6:769e306b7933

Change the repository level.
author yufei-luo
date Fri, 18 Jan 2013 04:54:14 -0500
parents
children 169d364ddd91
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/SMART/Java/Python/getRandomRegions.py	Fri Jan 18 04:54:14 2013 -0500
@@ -0,0 +1,267 @@
+#! /usr/bin/env python
+#
+# Copyright INRA-URGI 2009-2010
+# 
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+# 
+# As a counterpart to the access to the source code and rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty and the software's author, the holder of the
+# economic rights, and the successive licensors have only limited
+# liability.
+# 
+# In this respect, the user's attention is drawn to the risks associated
+# with loading, using, modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean that it is complicated to manipulate, and that also
+# therefore means that it is reserved for developers and experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and, more generally, to use and operate it in the
+# same conditions as regards security.
+# 
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+#
+"""Find random regions in a genome"""
+
+import random, math
+from optparse import OptionParser
+from commons.core.parsing.FastaParser import *
+from commons.core.writer.Gff3Writer import *
+from commons.core.writer.MySqlTranscriptWriter import *
+from SMART.Java.Python.misc.Progress import *
+from SMART.Java.Python.structure.Transcript import Transcript
+from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
+
+repetitions = 100
+
+
+class RandomRegionsGenerator(object):
+
+    def __init__(self, verbosity):
+        self.verbosity      = verbosity
+        self.strands        = False
+        self.distribution   = "uniform"
+        self.transcripts    = None
+        self.sequenceParser = None
+        random.seed()
+
+
+    def setInput(self, fileName):
+        self.sequenceParser = FastaParser(fileName, self.verbosity)
+
+
+    def setGenomeSize(self, size):
+        self.genomeSize = size
+
+
+    def setChromosomeName(self, name):
+        self.chromosomeName = name
+
+
+    def setAnnotation(self, fileName, format):
+        parser           = TranscriptContainer(fileName, format, self.verbosity)
+        self.transcripts = []
+        for transcript in parser.getIterator():
+            self.transcripts.append(transcript)
+        self.setNumber(len(self.transcripts))
+        self.setSize(0)
+
+
+    def setOutputFile(self, fileName):
+        self.outputFileName = fileName
+
+
+    def setSize(self, size):
+        self.minSize = size
+        self.maxSize = size
+
+
+    def setMinSize(self, size):
+        self.minSize = size
+
+
+    def setMaxSize(self, size):
+        self.maxSize = size
+
+
+    def setNumber(self, number):
+        self.number = number
+
+
+    def setStrands(self, strands):
+        self.strands = strands
+
+
+    def setMaxDistribution(self, maxElements):
+        if maxElements == None:
+            return
+        self.maxElements = maxElements
+        self.distribution = "gaussian"
+
+
+    def setDeviationDistribution(self, deviation):
+        if deviation == None:
+            return
+        self.deviation = deviation
+        self.distribution = "gaussian"
+
+
+    def getSizes(self):
+        if self.sequenceParser == None:
+            self.chromosomes    = [self.chromosomeName]
+            self.sizes          = {self.chromosomeName: self.genomeSize}
+            self.cumulatedSize  = self.genomeSize
+            self.cumulatedSizes = {self.chromosomeName: self.genomeSize}
+            return
+        self.chromosomes    = self.sequenceParser.getRegions()
+        self.sizes          = {}
+        self.cumulatedSize  = 0
+        self.cumulatedSizes = {}
+        for chromosome in self.chromosomes:
+            self.sizes[chromosome]          = self.sequenceParser.getSizeOfRegion(chromosome)
+            self.cumulatedSize             += self.sizes[chromosome]
+            self.cumulatedSizes[chromosome] = self.cumulatedSize
+
+
+    def findPosition(self, size = None):
+        if size == None:
+            size = random.randint(self.minSize, self.maxSize)
+        integer = random.randint(0, self.cumulatedSize)
+        for chromosome in self.chromosomes:
+            if self.cumulatedSizes[chromosome] > integer:
+                break
+        start = random.randint(1, self.sizes[chromosome] - size)
+        return (chromosome, start, size)
+
+
+    def createTranscript(self, chromosome, start, size, strand, cpt):
+        transcript = Transcript()
+        transcript.setChromosome(chromosome)
+        transcript.setStart(start)
+        transcript.setEnd(start + size-1)
+        transcript.setDirection(strand)
+        transcript.setName("rand_%d" % (cpt))
+        return transcript
+
+
+    def moveTranscript(self, chromosome, start, transcript):
+        while transcript.getEnd() + start - transcript.getStart() > self.cumulatedSizes[chromosome]:
+            chromosome, start, size = self.findPosition(transcript.getEnd() - transcript.getStart())
+        transcript.setChromosome(chromosome)
+        oldStart, oldEnd = transcript.getStart(), transcript.getEnd()
+        if transcript.getNbExons() > 1:
+            for exon in transcript.getNbExons():
+                oldExonStart, oldExonEnd = exon.getStart(), exon.getEnd()
+                exon.setStart(oldExonStart + start - oldStart)
+                exon.setEnd(oldExonEnd + start - oldStart)
+        transcript.setStart(start)
+        transcript.setEnd(oldEnd + start - oldStart)
+        return [transcript]
+
+
+    def createUniformCluster(self, chromosome, start, size, strand, cpt):
+        transcript = self.createTranscript(chromosome, start, size, strand, cpt)
+        return [transcript]
+
+
+    def findNbTranscripts(self, cpt):
+        return min(int(round(math.exp(random.random() * math.log(self.maxElements)))), self.number - cpt + 1)
+
+
+    def getDev(self):
+        deviation = 0.0
+        for j in range(repetitions):
+            deviation += random.randint(-self.deviation, self.deviation)
+        deviation /= repetitions
+        deviation  = int(round(deviation))
+        return deviation
+
+
+    def createGaussianCluster(self, chromosome, start, size, strand, cpt):
+        transcripts   = []
+        nbTranscripts = self.findNbTranscripts(cpt)
+        for i in range(nbTranscripts):
+            transcript = self.createTranscript(chromosome, start + self.getDev(), size + self.getDev(), strand, cpt + i)
+            transcripts.append(transcript)
+        return transcripts
+
+
+    def writeRegions(self):
+        writer     = Gff3Writer(self.outputFileName, self.verbosity)
+        outputFile = open(self.outputFileName, "w")
+        progress   = Progress(self.number, "Writing to %s" % (self.outputFileName), self.verbosity)
+        i          = 0
+        while i < self.number:
+            chromosome, start, size = self.findPosition()
+            strand                  = random.choice([-1, 1]) if self.strands else 1
+            if self.transcripts != None:
+                transcripts = self.moveTranscript(chromosome, start, self.transcripts[i])
+            elif self.distribution == "uniform":
+                transcripts = self.createUniformCluster(chromosome, start, size, strand, i+1)
+            else:
+                transcripts = self.createGaussianCluster(chromosome, start, size, strand, i+1)
+            for transcript in transcripts:
+                writer.addTranscript(transcript)
+                i += 1
+                progress.inc()
+        progress.done()
+        outputFile.close()
+        writer.write()
+        writer.close()
+
+
+    def run(self):
+        self.getSizes()
+        self.writeRegions()
+
+
+if __name__ == "__main__":
+    
+    # parse command line
+    description = "Get Random Regions v1.0.2: Get some random coordinates on a genome. May use uniform or gaussian distribution (in gaussion distribution, # of element per cluster follows a power law). [Category: Other]"
+
+    parser = OptionParser(description = description)
+    parser.add_option("-r", "--reference",     dest="reference",      action="store",      default=None,  type="string", help="file that contains the sequences [format: file in FASTA format]")
+    parser.add_option("-S", "--referenceSize", dest="referenceSize",  action="store",      default=None,  type="int",    help="size of the chromosome (when no reference is given) [format: int]")
+    parser.add_option("-c", "--chromosome",    dest="chromosome",     action="store",      default=None,  type="string", help="name of the chromosome (when no reference is given) [format: string]")
+    parser.add_option("-o", "--output",        dest="outputFileName", action="store",                     type="string", help="output file [compulsory] [format: output file in FASTA format]")
+    parser.add_option("-i", "--input",         dest="inputFileName",  action="store",      default=None,  type="string", help="optional file containing regions to shuffle [format: file in transcript format given by -f]")
+    parser.add_option("-f", "--format",        dest="format",         action="store",      default=None,  type="string", help="format of the previous file [format: transcript file format]")
+    parser.add_option("-s", "--size",          dest="size",           action="store",      default=None,  type="int",    help="size of the regions (if no region set is provided) [format: int]")
+    parser.add_option("-z", "--minSize",       dest="minSize",        action="store",      default=None,  type="int",    help="minimum size of the regions (if no region set nor a fixed size are provided) [format: int]")
+    parser.add_option("-Z", "--maxSize",       dest="maxSize",        action="store",      default=None,  type="int",    help="maximum size of the regions (if no region set nor a fixed size are provided) [format: int]")
+    parser.add_option("-n", "--number",        dest="number",         action="store",      default=None,  type="int",    help="number of regions (if no region set is provided) [format: int]")
+    parser.add_option("-t", "--strands",       dest="strands",        action="store_true", default=False,                help="use both strands (if no region set is provided) [format: boolean]")
+    parser.add_option("-m", "--max",           dest="max",            action="store",      default=None,  type="int",    help="max. # reads in a cluster (for Gaussian dist.) [format: int]")
+    parser.add_option("-d", "--deviation",     dest="deviation",      action="store",      default=None,  type="int",    help="deviation around the center of the cluster (for Gaussian dist.) [format: int]")
+    parser.add_option("-v", "--verbosity",     dest="verbosity",      action="store",      default=1,     type="int",    help="trace level [format: int]")
+    (options, args) = parser.parse_args()
+
+    rrg = RandomRegionsGenerator(options.verbosity)
+    if options.reference == None:
+        rrg.setGenomeSize(options.referenceSize)
+        rrg.setChromosomeName(options.chromosome)
+    else:
+        rrg.setInput(options.reference)
+    rrg.setOutputFile(options.outputFileName)
+    if options.inputFileName == None:
+        if options.size != None:
+            rrg.setSize(options.size)
+        else:
+            rrg.setMinSize(options.minSize)
+            rrg.setMaxSize(options.maxSize)
+        rrg.setNumber(options.number)
+        rrg.setStrands(options.strands)
+    else:
+        rrg.setAnnotation(options.inputFileName, options.format)
+    rrg.setMaxDistribution(options.max)
+    rrg.setDeviationDistribution(options.deviation)
+    rrg.run()
+