Mercurial > repos > yufei-luo > s_mart
view SMART/Java/Python/getRandomRegions.py @ 44:5f796c5c579f
Uploaded
author | m-zytnicki |
---|---|
date | Wed, 18 Sep 2013 08:32:38 -0400 |
parents | 769e306b7933 |
children | 169d364ddd91 |
line wrap: on
line source
#! /usr/bin/env python # # Copyright INRA-URGI 2009-2010 # # This software is governed by the CeCILL license under French law and # abiding by the rules of distribution of free software. You can use, # modify and/ or redistribute the software under the terms of the CeCILL # license as circulated by CEA, CNRS and INRIA at the following URL # "http://www.cecill.info". # # As a counterpart to the access to the source code and rights to copy, # modify and redistribute granted by the license, users are provided only # with a limited warranty and the software's author, the holder of the # economic rights, and the successive licensors have only limited # liability. # # In this respect, the user's attention is drawn to the risks associated # with loading, using, modifying and/or developing or reproducing the # software by the user in light of its specific status of free software, # that may mean that it is complicated to manipulate, and that also # therefore means that it is reserved for developers and experienced # professionals having in-depth computer knowledge. Users are therefore # encouraged to load and test the software's suitability as regards their # requirements in conditions enabling the security of their systems and/or # data to be ensured and, more generally, to use and operate it in the # same conditions as regards security. # # The fact that you are presently reading this means that you have had # knowledge of the CeCILL license and that you accept its terms. # """Find random regions in a genome""" import random, math from optparse import OptionParser from commons.core.parsing.FastaParser import * from commons.core.writer.Gff3Writer import * from commons.core.writer.MySqlTranscriptWriter import * from SMART.Java.Python.misc.Progress import * from SMART.Java.Python.structure.Transcript import Transcript from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer repetitions = 100 class RandomRegionsGenerator(object): def __init__(self, verbosity): self.verbosity = verbosity self.strands = False self.distribution = "uniform" self.transcripts = None self.sequenceParser = None random.seed() def setInput(self, fileName): self.sequenceParser = FastaParser(fileName, self.verbosity) def setGenomeSize(self, size): self.genomeSize = size def setChromosomeName(self, name): self.chromosomeName = name def setAnnotation(self, fileName, format): parser = TranscriptContainer(fileName, format, self.verbosity) self.transcripts = [] for transcript in parser.getIterator(): self.transcripts.append(transcript) self.setNumber(len(self.transcripts)) self.setSize(0) def setOutputFile(self, fileName): self.outputFileName = fileName def setSize(self, size): self.minSize = size self.maxSize = size def setMinSize(self, size): self.minSize = size def setMaxSize(self, size): self.maxSize = size def setNumber(self, number): self.number = number def setStrands(self, strands): self.strands = strands def setMaxDistribution(self, maxElements): if maxElements == None: return self.maxElements = maxElements self.distribution = "gaussian" def setDeviationDistribution(self, deviation): if deviation == None: return self.deviation = deviation self.distribution = "gaussian" def getSizes(self): if self.sequenceParser == None: self.chromosomes = [self.chromosomeName] self.sizes = {self.chromosomeName: self.genomeSize} self.cumulatedSize = self.genomeSize self.cumulatedSizes = {self.chromosomeName: self.genomeSize} return self.chromosomes = self.sequenceParser.getRegions() self.sizes = {} self.cumulatedSize = 0 self.cumulatedSizes = {} for chromosome in self.chromosomes: self.sizes[chromosome] = self.sequenceParser.getSizeOfRegion(chromosome) self.cumulatedSize += self.sizes[chromosome] self.cumulatedSizes[chromosome] = self.cumulatedSize def findPosition(self, size = None): if size == None: size = random.randint(self.minSize, self.maxSize) integer = random.randint(0, self.cumulatedSize) for chromosome in self.chromosomes: if self.cumulatedSizes[chromosome] > integer: break start = random.randint(1, self.sizes[chromosome] - size) return (chromosome, start, size) def createTranscript(self, chromosome, start, size, strand, cpt): transcript = Transcript() transcript.setChromosome(chromosome) transcript.setStart(start) transcript.setEnd(start + size-1) transcript.setDirection(strand) transcript.setName("rand_%d" % (cpt)) return transcript def moveTranscript(self, chromosome, start, transcript): while transcript.getEnd() + start - transcript.getStart() > self.cumulatedSizes[chromosome]: chromosome, start, size = self.findPosition(transcript.getEnd() - transcript.getStart()) transcript.setChromosome(chromosome) oldStart, oldEnd = transcript.getStart(), transcript.getEnd() if transcript.getNbExons() > 1: for exon in transcript.getNbExons(): oldExonStart, oldExonEnd = exon.getStart(), exon.getEnd() exon.setStart(oldExonStart + start - oldStart) exon.setEnd(oldExonEnd + start - oldStart) transcript.setStart(start) transcript.setEnd(oldEnd + start - oldStart) return [transcript] def createUniformCluster(self, chromosome, start, size, strand, cpt): transcript = self.createTranscript(chromosome, start, size, strand, cpt) return [transcript] def findNbTranscripts(self, cpt): return min(int(round(math.exp(random.random() * math.log(self.maxElements)))), self.number - cpt + 1) def getDev(self): deviation = 0.0 for j in range(repetitions): deviation += random.randint(-self.deviation, self.deviation) deviation /= repetitions deviation = int(round(deviation)) return deviation def createGaussianCluster(self, chromosome, start, size, strand, cpt): transcripts = [] nbTranscripts = self.findNbTranscripts(cpt) for i in range(nbTranscripts): transcript = self.createTranscript(chromosome, start + self.getDev(), size + self.getDev(), strand, cpt + i) transcripts.append(transcript) return transcripts def writeRegions(self): writer = Gff3Writer(self.outputFileName, self.verbosity) outputFile = open(self.outputFileName, "w") progress = Progress(self.number, "Writing to %s" % (self.outputFileName), self.verbosity) i = 0 while i < self.number: chromosome, start, size = self.findPosition() strand = random.choice([-1, 1]) if self.strands else 1 if self.transcripts != None: transcripts = self.moveTranscript(chromosome, start, self.transcripts[i]) elif self.distribution == "uniform": transcripts = self.createUniformCluster(chromosome, start, size, strand, i+1) else: transcripts = self.createGaussianCluster(chromosome, start, size, strand, i+1) for transcript in transcripts: writer.addTranscript(transcript) i += 1 progress.inc() progress.done() outputFile.close() writer.write() writer.close() def run(self): self.getSizes() self.writeRegions() if __name__ == "__main__": # parse command line description = "Get Random Regions v1.0.2: Get some random coordinates on a genome. May use uniform or gaussian distribution (in gaussion distribution, # of element per cluster follows a power law). [Category: Other]" parser = OptionParser(description = description) parser.add_option("-r", "--reference", dest="reference", action="store", default=None, type="string", help="file that contains the sequences [format: file in FASTA format]") parser.add_option("-S", "--referenceSize", dest="referenceSize", action="store", default=None, type="int", help="size of the chromosome (when no reference is given) [format: int]") parser.add_option("-c", "--chromosome", dest="chromosome", action="store", default=None, type="string", help="name of the chromosome (when no reference is given) [format: string]") parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in FASTA format]") parser.add_option("-i", "--input", dest="inputFileName", action="store", default=None, type="string", help="optional file containing regions to shuffle [format: file in transcript format given by -f]") parser.add_option("-f", "--format", dest="format", action="store", default=None, type="string", help="format of the previous file [format: transcript file format]") parser.add_option("-s", "--size", dest="size", action="store", default=None, type="int", help="size of the regions (if no region set is provided) [format: int]") parser.add_option("-z", "--minSize", dest="minSize", action="store", default=None, type="int", help="minimum size of the regions (if no region set nor a fixed size are provided) [format: int]") parser.add_option("-Z", "--maxSize", dest="maxSize", action="store", default=None, type="int", help="maximum size of the regions (if no region set nor a fixed size are provided) [format: int]") parser.add_option("-n", "--number", dest="number", action="store", default=None, type="int", help="number of regions (if no region set is provided) [format: int]") parser.add_option("-t", "--strands", dest="strands", action="store_true", default=False, help="use both strands (if no region set is provided) [format: boolean]") parser.add_option("-m", "--max", dest="max", action="store", default=None, type="int", help="max. # reads in a cluster (for Gaussian dist.) [format: int]") parser.add_option("-d", "--deviation", dest="deviation", action="store", default=None, type="int", help="deviation around the center of the cluster (for Gaussian dist.) [format: int]") parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") (options, args) = parser.parse_args() rrg = RandomRegionsGenerator(options.verbosity) if options.reference == None: rrg.setGenomeSize(options.referenceSize) rrg.setChromosomeName(options.chromosome) else: rrg.setInput(options.reference) rrg.setOutputFile(options.outputFileName) if options.inputFileName == None: if options.size != None: rrg.setSize(options.size) else: rrg.setMinSize(options.minSize) rrg.setMaxSize(options.maxSize) rrg.setNumber(options.number) rrg.setStrands(options.strands) else: rrg.setAnnotation(options.inputFileName, options.format) rrg.setMaxDistribution(options.max) rrg.setDeviationDistribution(options.deviation) rrg.run()