Mercurial > repos > yufei-luo > s_mart
comparison SMART/Java/Python/getRandomRegions.py @ 6:769e306b7933
Change the repository level.
author | yufei-luo |
---|---|
date | Fri, 18 Jan 2013 04:54:14 -0500 |
parents | |
children | 169d364ddd91 |
comparison
equal
deleted
inserted
replaced
5:ea3082881bf8 | 6:769e306b7933 |
---|---|
1 #! /usr/bin/env python | |
2 # | |
3 # Copyright INRA-URGI 2009-2010 | |
4 # | |
5 # This software is governed by the CeCILL license under French law and | |
6 # abiding by the rules of distribution of free software. You can use, | |
7 # modify and/ or redistribute the software under the terms of the CeCILL | |
8 # license as circulated by CEA, CNRS and INRIA at the following URL | |
9 # "http://www.cecill.info". | |
10 # | |
11 # As a counterpart to the access to the source code and rights to copy, | |
12 # modify and redistribute granted by the license, users are provided only | |
13 # with a limited warranty and the software's author, the holder of the | |
14 # economic rights, and the successive licensors have only limited | |
15 # liability. | |
16 # | |
17 # In this respect, the user's attention is drawn to the risks associated | |
18 # with loading, using, modifying and/or developing or reproducing the | |
19 # software by the user in light of its specific status of free software, | |
20 # that may mean that it is complicated to manipulate, and that also | |
21 # therefore means that it is reserved for developers and experienced | |
22 # professionals having in-depth computer knowledge. Users are therefore | |
23 # encouraged to load and test the software's suitability as regards their | |
24 # requirements in conditions enabling the security of their systems and/or | |
25 # data to be ensured and, more generally, to use and operate it in the | |
26 # same conditions as regards security. | |
27 # | |
28 # The fact that you are presently reading this means that you have had | |
29 # knowledge of the CeCILL license and that you accept its terms. | |
30 # | |
31 """Find random regions in a genome""" | |
32 | |
33 import random, math | |
34 from optparse import OptionParser | |
35 from commons.core.parsing.FastaParser import * | |
36 from commons.core.writer.Gff3Writer import * | |
37 from commons.core.writer.MySqlTranscriptWriter import * | |
38 from SMART.Java.Python.misc.Progress import * | |
39 from SMART.Java.Python.structure.Transcript import Transcript | |
40 from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer | |
41 | |
42 repetitions = 100 | |
43 | |
44 | |
45 class RandomRegionsGenerator(object): | |
46 | |
47 def __init__(self, verbosity): | |
48 self.verbosity = verbosity | |
49 self.strands = False | |
50 self.distribution = "uniform" | |
51 self.transcripts = None | |
52 self.sequenceParser = None | |
53 random.seed() | |
54 | |
55 | |
56 def setInput(self, fileName): | |
57 self.sequenceParser = FastaParser(fileName, self.verbosity) | |
58 | |
59 | |
60 def setGenomeSize(self, size): | |
61 self.genomeSize = size | |
62 | |
63 | |
64 def setChromosomeName(self, name): | |
65 self.chromosomeName = name | |
66 | |
67 | |
68 def setAnnotation(self, fileName, format): | |
69 parser = TranscriptContainer(fileName, format, self.verbosity) | |
70 self.transcripts = [] | |
71 for transcript in parser.getIterator(): | |
72 self.transcripts.append(transcript) | |
73 self.setNumber(len(self.transcripts)) | |
74 self.setSize(0) | |
75 | |
76 | |
77 def setOutputFile(self, fileName): | |
78 self.outputFileName = fileName | |
79 | |
80 | |
81 def setSize(self, size): | |
82 self.minSize = size | |
83 self.maxSize = size | |
84 | |
85 | |
86 def setMinSize(self, size): | |
87 self.minSize = size | |
88 | |
89 | |
90 def setMaxSize(self, size): | |
91 self.maxSize = size | |
92 | |
93 | |
94 def setNumber(self, number): | |
95 self.number = number | |
96 | |
97 | |
98 def setStrands(self, strands): | |
99 self.strands = strands | |
100 | |
101 | |
102 def setMaxDistribution(self, maxElements): | |
103 if maxElements == None: | |
104 return | |
105 self.maxElements = maxElements | |
106 self.distribution = "gaussian" | |
107 | |
108 | |
109 def setDeviationDistribution(self, deviation): | |
110 if deviation == None: | |
111 return | |
112 self.deviation = deviation | |
113 self.distribution = "gaussian" | |
114 | |
115 | |
116 def getSizes(self): | |
117 if self.sequenceParser == None: | |
118 self.chromosomes = [self.chromosomeName] | |
119 self.sizes = {self.chromosomeName: self.genomeSize} | |
120 self.cumulatedSize = self.genomeSize | |
121 self.cumulatedSizes = {self.chromosomeName: self.genomeSize} | |
122 return | |
123 self.chromosomes = self.sequenceParser.getRegions() | |
124 self.sizes = {} | |
125 self.cumulatedSize = 0 | |
126 self.cumulatedSizes = {} | |
127 for chromosome in self.chromosomes: | |
128 self.sizes[chromosome] = self.sequenceParser.getSizeOfRegion(chromosome) | |
129 self.cumulatedSize += self.sizes[chromosome] | |
130 self.cumulatedSizes[chromosome] = self.cumulatedSize | |
131 | |
132 | |
133 def findPosition(self, size = None): | |
134 if size == None: | |
135 size = random.randint(self.minSize, self.maxSize) | |
136 integer = random.randint(0, self.cumulatedSize) | |
137 for chromosome in self.chromosomes: | |
138 if self.cumulatedSizes[chromosome] > integer: | |
139 break | |
140 start = random.randint(1, self.sizes[chromosome] - size) | |
141 return (chromosome, start, size) | |
142 | |
143 | |
144 def createTranscript(self, chromosome, start, size, strand, cpt): | |
145 transcript = Transcript() | |
146 transcript.setChromosome(chromosome) | |
147 transcript.setStart(start) | |
148 transcript.setEnd(start + size-1) | |
149 transcript.setDirection(strand) | |
150 transcript.setName("rand_%d" % (cpt)) | |
151 return transcript | |
152 | |
153 | |
154 def moveTranscript(self, chromosome, start, transcript): | |
155 while transcript.getEnd() + start - transcript.getStart() > self.cumulatedSizes[chromosome]: | |
156 chromosome, start, size = self.findPosition(transcript.getEnd() - transcript.getStart()) | |
157 transcript.setChromosome(chromosome) | |
158 oldStart, oldEnd = transcript.getStart(), transcript.getEnd() | |
159 if transcript.getNbExons() > 1: | |
160 for exon in transcript.getNbExons(): | |
161 oldExonStart, oldExonEnd = exon.getStart(), exon.getEnd() | |
162 exon.setStart(oldExonStart + start - oldStart) | |
163 exon.setEnd(oldExonEnd + start - oldStart) | |
164 transcript.setStart(start) | |
165 transcript.setEnd(oldEnd + start - oldStart) | |
166 return [transcript] | |
167 | |
168 | |
169 def createUniformCluster(self, chromosome, start, size, strand, cpt): | |
170 transcript = self.createTranscript(chromosome, start, size, strand, cpt) | |
171 return [transcript] | |
172 | |
173 | |
174 def findNbTranscripts(self, cpt): | |
175 return min(int(round(math.exp(random.random() * math.log(self.maxElements)))), self.number - cpt + 1) | |
176 | |
177 | |
178 def getDev(self): | |
179 deviation = 0.0 | |
180 for j in range(repetitions): | |
181 deviation += random.randint(-self.deviation, self.deviation) | |
182 deviation /= repetitions | |
183 deviation = int(round(deviation)) | |
184 return deviation | |
185 | |
186 | |
187 def createGaussianCluster(self, chromosome, start, size, strand, cpt): | |
188 transcripts = [] | |
189 nbTranscripts = self.findNbTranscripts(cpt) | |
190 for i in range(nbTranscripts): | |
191 transcript = self.createTranscript(chromosome, start + self.getDev(), size + self.getDev(), strand, cpt + i) | |
192 transcripts.append(transcript) | |
193 return transcripts | |
194 | |
195 | |
196 def writeRegions(self): | |
197 writer = Gff3Writer(self.outputFileName, self.verbosity) | |
198 outputFile = open(self.outputFileName, "w") | |
199 progress = Progress(self.number, "Writing to %s" % (self.outputFileName), self.verbosity) | |
200 i = 0 | |
201 while i < self.number: | |
202 chromosome, start, size = self.findPosition() | |
203 strand = random.choice([-1, 1]) if self.strands else 1 | |
204 if self.transcripts != None: | |
205 transcripts = self.moveTranscript(chromosome, start, self.transcripts[i]) | |
206 elif self.distribution == "uniform": | |
207 transcripts = self.createUniformCluster(chromosome, start, size, strand, i+1) | |
208 else: | |
209 transcripts = self.createGaussianCluster(chromosome, start, size, strand, i+1) | |
210 for transcript in transcripts: | |
211 writer.addTranscript(transcript) | |
212 i += 1 | |
213 progress.inc() | |
214 progress.done() | |
215 outputFile.close() | |
216 writer.write() | |
217 writer.close() | |
218 | |
219 | |
220 def run(self): | |
221 self.getSizes() | |
222 self.writeRegions() | |
223 | |
224 | |
225 if __name__ == "__main__": | |
226 | |
227 # parse command line | |
228 description = "Get Random Regions v1.0.2: Get some random coordinates on a genome. May use uniform or gaussian distribution (in gaussion distribution, # of element per cluster follows a power law). [Category: Other]" | |
229 | |
230 parser = OptionParser(description = description) | |
231 parser.add_option("-r", "--reference", dest="reference", action="store", default=None, type="string", help="file that contains the sequences [format: file in FASTA format]") | |
232 parser.add_option("-S", "--referenceSize", dest="referenceSize", action="store", default=None, type="int", help="size of the chromosome (when no reference is given) [format: int]") | |
233 parser.add_option("-c", "--chromosome", dest="chromosome", action="store", default=None, type="string", help="name of the chromosome (when no reference is given) [format: string]") | |
234 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in FASTA format]") | |
235 parser.add_option("-i", "--input", dest="inputFileName", action="store", default=None, type="string", help="optional file containing regions to shuffle [format: file in transcript format given by -f]") | |
236 parser.add_option("-f", "--format", dest="format", action="store", default=None, type="string", help="format of the previous file [format: transcript file format]") | |
237 parser.add_option("-s", "--size", dest="size", action="store", default=None, type="int", help="size of the regions (if no region set is provided) [format: int]") | |
238 parser.add_option("-z", "--minSize", dest="minSize", action="store", default=None, type="int", help="minimum size of the regions (if no region set nor a fixed size are provided) [format: int]") | |
239 parser.add_option("-Z", "--maxSize", dest="maxSize", action="store", default=None, type="int", help="maximum size of the regions (if no region set nor a fixed size are provided) [format: int]") | |
240 parser.add_option("-n", "--number", dest="number", action="store", default=None, type="int", help="number of regions (if no region set is provided) [format: int]") | |
241 parser.add_option("-t", "--strands", dest="strands", action="store_true", default=False, help="use both strands (if no region set is provided) [format: boolean]") | |
242 parser.add_option("-m", "--max", dest="max", action="store", default=None, type="int", help="max. # reads in a cluster (for Gaussian dist.) [format: int]") | |
243 parser.add_option("-d", "--deviation", dest="deviation", action="store", default=None, type="int", help="deviation around the center of the cluster (for Gaussian dist.) [format: int]") | |
244 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") | |
245 (options, args) = parser.parse_args() | |
246 | |
247 rrg = RandomRegionsGenerator(options.verbosity) | |
248 if options.reference == None: | |
249 rrg.setGenomeSize(options.referenceSize) | |
250 rrg.setChromosomeName(options.chromosome) | |
251 else: | |
252 rrg.setInput(options.reference) | |
253 rrg.setOutputFile(options.outputFileName) | |
254 if options.inputFileName == None: | |
255 if options.size != None: | |
256 rrg.setSize(options.size) | |
257 else: | |
258 rrg.setMinSize(options.minSize) | |
259 rrg.setMaxSize(options.maxSize) | |
260 rrg.setNumber(options.number) | |
261 rrg.setStrands(options.strands) | |
262 else: | |
263 rrg.setAnnotation(options.inputFileName, options.format) | |
264 rrg.setMaxDistribution(options.max) | |
265 rrg.setDeviationDistribution(options.deviation) | |
266 rrg.run() | |
267 |