s_mart: smart_toolShed/SMART/Java/Python/getRandomRegions.py comparison

comparison smart_toolShed/SMART/Java/Python/getRandomRegions.py @ 0:e0f8dcca02ed

Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.

author	yufei-luo
date	Thu, 17 Jan 2013 10:52:14 -0500
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:e0f8dcca02ed
+#! /usr/bin/env python
+#
+# Copyright INRA-URGI 2009-2010
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+#
+# As a counterpart to the access to the source code and rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty and the software's author, the holder of the
+# economic rights, and the successive licensors have only limited
+# liability.
+#
+# In this respect, the user's attention is drawn to the risks associated
+# with loading, using, modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean that it is complicated to manipulate, and that also
+# therefore means that it is reserved for developers and experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and, more generally, to use and operate it in the
+# same conditions as regards security.
+#
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+#
+"""Find random regions in a genome"""
+import random, math
+from optparse import OptionParser
+from commons.core.parsing.FastaParser import *
+from commons.core.writer.Gff3Writer import *
+from commons.core.writer.MySqlTranscriptWriter import *
+from SMART.Java.Python.misc.Progress import *
+from SMART.Java.Python.structure.Transcript import Transcript
+from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
+repetitions = 100
+class RandomRegionsGenerator(object):
+def __init__(self, verbosity):
+self.verbosity      = verbosity
+self.strands        = False
+self.distribution   = "uniform"
+self.transcripts    = None
+self.sequenceParser = None
+random.seed()
+def setInput(self, fileName):
+self.sequenceParser = FastaParser(fileName, self.verbosity)
+def setGenomeSize(self, size):
+self.genomeSize = size
+def setChromosomeName(self, name):
+self.chromosomeName = name
+def setAnnotation(self, fileName, format):
+parser           = TranscriptContainer(fileName, format, self.verbosity)
+self.transcripts = []
+for transcript in parser.getIterator():
+self.transcripts.append(transcript)
+self.setNumber(len(self.transcripts))
+self.setSize(0)
+def setOutputFile(self, fileName):
+self.outputFileName = fileName
+def setSize(self, size):
+self.minSize = size
+self.maxSize = size
+def setMinSize(self, size):
+self.minSize = size
+def setMaxSize(self, size):
+self.maxSize = size
+def setNumber(self, number):
+self.number = number
+def setStrands(self, strands):
+self.strands = strands
+def setMaxDistribution(self, maxElements):
+if maxElements == None:
+return
+self.maxElements = maxElements
+self.distribution = "gaussian"
+def setDeviationDistribution(self, deviation):
+if deviation == None:
+return
+self.deviation = deviation
+self.distribution = "gaussian"
+def getSizes(self):
+if self.sequenceParser == None:
+self.chromosomes    = [self.chromosomeName]
+self.sizes          = {self.chromosomeName: self.genomeSize}
+self.cumulatedSize  = self.genomeSize
+self.cumulatedSizes = {self.chromosomeName: self.genomeSize}
+return
+self.chromosomes    = self.sequenceParser.getRegions()
+self.sizes          = {}
+self.cumulatedSize  = 0
+self.cumulatedSizes = {}
+for chromosome in self.chromosomes:
+self.sizes[chromosome]          = self.sequenceParser.getSizeOfRegion(chromosome)
+self.cumulatedSize             += self.sizes[chromosome]
+self.cumulatedSizes[chromosome] = self.cumulatedSize
+def findPosition(self, size = None):
+if size == None:
+size = random.randint(self.minSize, self.maxSize)
+integer = random.randint(0, self.cumulatedSize)
+for chromosome in self.chromosomes:
+if self.cumulatedSizes[chromosome] > integer:
+break
+start = random.randint(1, self.sizes[chromosome] - size)
+return (chromosome, start, size)
+def createTranscript(self, chromosome, start, size, strand, cpt):
+transcript = Transcript()
+transcript.setChromosome(chromosome)
+transcript.setStart(start)
+transcript.setEnd(start + size-1)
+transcript.setDirection(strand)
+transcript.setName("rand_%d" % (cpt))
+return transcript
+def moveTranscript(self, chromosome, start, transcript):
+while transcript.getEnd() + start - transcript.getStart() > self.cumulatedSizes[chromosome]:
+chromosome, start, size = self.findPosition(transcript.getEnd() - transcript.getStart())
+transcript.setChromosome(chromosome)
+oldStart, oldEnd = transcript.getStart(), transcript.getEnd()
+if transcript.getNbExons() > 1:
+for exon in transcript.getNbExons():
+oldExonStart, oldExonEnd = exon.getStart(), exon.getEnd()
+exon.setStart(oldExonStart + start - oldStart)
+exon.setEnd(oldExonEnd + start - oldStart)
+transcript.setStart(start)
+transcript.setEnd(oldEnd + start - oldStart)
+return [transcript]
+def createUniformCluster(self, chromosome, start, size, strand, cpt):
+transcript = self.createTranscript(chromosome, start, size, strand, cpt)
+return [transcript]
+def findNbTranscripts(self, cpt):
+return min(int(round(math.exp(random.random() * math.log(self.maxElements)))), self.number - cpt + 1)
+def getDev(self):
+deviation = 0.0
+for j in range(repetitions):
+deviation += random.randint(-self.deviation, self.deviation)
+deviation /= repetitions
+deviation  = int(round(deviation))
+return deviation
+def createGaussianCluster(self, chromosome, start, size, strand, cpt):
+transcripts   = []
+nbTranscripts = self.findNbTranscripts(cpt)
+for i in range(nbTranscripts):
+transcript = self.createTranscript(chromosome, start + self.getDev(), size + self.getDev(), strand, cpt + i)
+transcripts.append(transcript)
+return transcripts
+def writeRegions(self):
+writer     = Gff3Writer(self.outputFileName, self.verbosity)
+outputFile = open(self.outputFileName, "w")
+progress   = Progress(self.number, "Writing to %s" % (self.outputFileName), self.verbosity)
+i          = 0
+while i < self.number:
+chromosome, start, size = self.findPosition()
+strand                  = random.choice([-1, 1]) if self.strands else 1
+if self.transcripts != None:
+transcripts = self.moveTranscript(chromosome, start, self.transcripts[i])
+elif self.distribution == "uniform":
+transcripts = self.createUniformCluster(chromosome, start, size, strand, i+1)
+else:
+transcripts = self.createGaussianCluster(chromosome, start, size, strand, i+1)
+for transcript in transcripts:
+writer.addTranscript(transcript)
+i += 1
+progress.inc()
+progress.done()
+outputFile.close()
+writer.write()
+writer.close()
+def run(self):
+self.getSizes()
+self.writeRegions()
+if __name__ == "__main__":
+# parse command line
+description = "Get Random Regions v1.0.2: Get some random coordinates on a genome. May use uniform or gaussian distribution (in gaussion distribution, # of element per cluster follows a power law). [Category: Other]"
+parser = OptionParser(description = description)
+parser.add_option("-r", "--reference",     dest="reference",      action="store",      default=None,  type="string", help="file that contains the sequences [format: file in FASTA format]")
+parser.add_option("-S", "--referenceSize", dest="referenceSize",  action="store",      default=None,  type="int",    help="size of the chromosome (when no reference is given) [format: int]")
+parser.add_option("-c", "--chromosome",    dest="chromosome",     action="store",      default=None,  type="string", help="name of the chromosome (when no reference is given) [format: string]")
+parser.add_option("-o", "--output",        dest="outputFileName", action="store",                     type="string", help="output file [compulsory] [format: output file in FASTA format]")
+parser.add_option("-i", "--input",         dest="inputFileName",  action="store",      default=None,  type="string", help="optional file containing regions to shuffle [format: file in transcript format given by -f]")
+parser.add_option("-f", "--format",        dest="format",         action="store",      default=None,  type="string", help="format of the previous file [format: transcript file format]")
+parser.add_option("-s", "--size",          dest="size",           action="store",      default=None,  type="int",    help="size of the regions (if no region set is provided) [format: int]")
+parser.add_option("-z", "--minSize",       dest="minSize",        action="store",      default=None,  type="int",    help="minimum size of the regions (if no region set nor a fixed size are provided) [format: int]")
+parser.add_option("-Z", "--maxSize",       dest="maxSize",        action="store",      default=None,  type="int",    help="maximum size of the regions (if no region set nor a fixed size are provided) [format: int]")
+parser.add_option("-n", "--number",        dest="number",         action="store",      default=None,  type="int",    help="number of regions (if no region set is provided) [format: int]")
+parser.add_option("-t", "--strands",       dest="strands",        action="store_true", default=False,                help="use both strands (if no region set is provided) [format: boolean]")
+parser.add_option("-m", "--max",           dest="max",            action="store",      default=None,  type="int",    help="max. # reads in a cluster (for Gaussian dist.) [format: int]")
+parser.add_option("-d", "--deviation",     dest="deviation",      action="store",      default=None,  type="int",    help="deviation around the center of the cluster (for Gaussian dist.) [format: int]")
+parser.add_option("-v", "--verbosity",     dest="verbosity",      action="store",      default=1,     type="int",    help="trace level [format: int]")
+(options, args) = parser.parse_args()
+rrg = RandomRegionsGenerator(options.verbosity)
+if options.reference == None:
+rrg.setGenomeSize(options.referenceSize)
+rrg.setChromosomeName(options.chromosome)
+else:
+rrg.setInput(options.reference)
+rrg.setOutputFile(options.outputFileName)
+if options.inputFileName == None:
+if options.size != None:
+rrg.setSize(options.size)
+else:
+rrg.setMinSize(options.minSize)
+rrg.setMaxSize(options.maxSize)
+rrg.setNumber(options.number)
+rrg.setStrands(options.strands)
+else:
+rrg.setAnnotation(options.inputFileName, options.format)
+rrg.setMaxDistribution(options.max)
+rrg.setDeviationDistribution(options.deviation)
+rrg.run()

Mercurial > repos > yufei-luo > s_mart

comparison smart_toolShed/SMART/Java/Python/getRandomRegions.py @ 0:e0f8dcca02ed