comparison smart_toolShed/SMART/Java/Python/getDistribution.py @ 0:e0f8dcca02ed

Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.
author yufei-luo
date Thu, 17 Jan 2013 10:52:14 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e0f8dcca02ed
1 #! /usr/bin/env python
2 #
3 # Copyright INRA-URGI 2009-2010
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30 #
31 """Get the repartition of some elements in a chromosomes"""
32
33 import os
34 from optparse import OptionParser
35 from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
36 from SMART.Java.Python.structure.Transcript import Transcript
37 from commons.core.writer.Gff3Writer import Gff3Writer
38 from SMART.Java.Python.misc.RPlotter import RPlotter
39 from SMART.Java.Python.misc.Progress import Progress
40 from math import *
41
42 def divideKeyDict(dictionary, ratio):
43 return dict([(key / ratio, dictionary[key]) for key in dictionary])
44
45
46 def setTranscript(chromosome, direction, start, end, name, value):
47 transcript = Transcript()
48 transcript.setChromosome(chromosome)
49 transcript.setDirection(direction)
50 transcript.setStart(start)
51 transcript.setEnd(end)
52 transcript.setName(name)
53 transcript.setTagValue("nbElements", value)
54 return transcript
55
56
57
58 if __name__ == "__main__":
59
60 magnifyingFactor = 1000
61
62 # parse command line
63 description = "Get Distribution v1.0.1: Get the distribution of the genomic coordinates on a genome. [Category: Visualization]"
64
65 parser = OptionParser(description = description)
66 parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
67 parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input file [compulsory] [format: transcript file format]")
68 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]")
69 parser.add_option("-r", "--reference", dest="referenceFileName", action="store", default=None, type="string", help="file containing the genome [compulsory] [format: file in FASTA format]")
70 parser.add_option("-n", "--nbBins", dest="nbBins", action="store", default=1000, type="int", help="number of bins [default: 1000] [format: int]")
71 parser.add_option("-2", "--bothStrands", dest="bothStrands", action="store_true", default=False, help="plot one curve per strand [format: bool] [default: false]")
72 parser.add_option("-w", "--raw", dest="raw", action="store_true", default=False, help="plot raw number of occurrences instead of density [format: bool] [default: false]")
73 parser.add_option("-x", "--csv", dest="csv", action="store_true", default=False, help="write a .csv file [format: bool]")
74 parser.add_option("-c", "--chromosome", dest="chromosome", action="store", default=None, type="string", help="plot only a chromosome [format: string]")
75 parser.add_option("-s", "--start", dest="start", action="store", default=None, type="int", help="start from a given region [format: int]")
76 parser.add_option("-e", "--end", dest="end", action="store", default=None, type="int", help="end from a given region [format: int]")
77 parser.add_option("-y", "--yMin", dest="yMin", action="store", default=None, type="int", help="minimum value on the y-axis to plot [format: int]")
78 parser.add_option("-Y", "--yMax", dest="yMax", action="store", default=None, type="int", help="maximum value on the y-axis to plot [format: int]")
79 parser.add_option("-g", "--gff", dest="gff", action="store_true", default=False, help="also write GFF3 file [format: bool] [default: false]")
80 parser.add_option("-H", "--height", dest="height", action="store", default=None, type="int", help="height of the graphics [format: int] [default: 300]")
81 parser.add_option("-W", "--width", dest="width", action="store", default=None, type="int", help="width of the graphics [format: int] [default: 1000]")
82 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [default: 1] [format: int]")
83 parser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool]")
84 parser.add_option("-D", "--directory", dest="working_Dir", action="store", default=os.getcwd(), type="string", help="the directory to store the results [format: directory]")
85 (options, args) = parser.parse_args()
86
87 sizes = {}
88 if options.referenceFileName != None:
89 # get the sizes of the chromosomes
90 referenceHandle = open(options.referenceFileName)
91 name = None
92 size = 0
93 maxSize = 0
94 for line in referenceHandle:
95 line = line.strip()
96 if line == "": continue
97 if line[0] == ">":
98 if name != None:
99 if options.verbosity > 10:
100 print name
101 sizes[name] = size
102 maxSize = max(maxSize, size)
103 size = 0
104 name = line[1:]
105 else:
106 size += len(line)
107 sizes[name] = size
108 maxSize = max(maxSize, size)
109 if options.verbosity > 1:
110 print "done"
111 start = 0
112 end = maxSize
113 else:
114 if options.chromosome == None or options.start == None or options.end == None:
115 raise Exception("Missing chromosome or start and end positions, or reference file")
116 maxSize = options.end
117 sizes[options.chromosome] = options.end
118 start = options.start
119 end = options.end
120
121
122 tmp1 = int(maxSize / float(options.nbBins))
123 tmp2 = 10 ** (len("%d" % (tmp1))-2)
124 sliceSize = int((tmp1 / tmp2) * tmp2)
125
126 bins = dict()
127 binsPlus = dict()
128 binsMinus = dict()
129 for chromosome in sizes:
130 bins[chromosome] = dict([(i * sliceSize + 1, 0) for i in range(start / sliceSize, sizes[chromosome] / sliceSize + 1)])
131 binsPlus[chromosome] = dict([(i * sliceSize + 1, 0) for i in range(start / sliceSize, sizes[chromosome] / sliceSize + 1)])
132 binsMinus[chromosome] = dict([(i * sliceSize + 1, 0) for i in range(start / sliceSize, sizes[chromosome] / sliceSize + 1)])
133
134 parser = TranscriptContainer(options.inputFileName, options.format, options.verbosity)
135 progress = Progress(parser.getNbTranscripts(), "Parsing %s" % (options.inputFileName), options.verbosity)
136 maxSlice = 0
137 # count the number of reads
138 for transcript in parser.getIterator():
139 if options.chromosome == None or (transcript.getChromosome() == options.chromosome and transcript.getStart() >= start and transcript.getStart() <= end):
140 if transcript.getDirection() == 1:
141 binsPlus[transcript.getChromosome()][(transcript.getStart() / sliceSize) * sliceSize + 1] += 1
142 else:
143 binsMinus[transcript.getChromosome()][(transcript.getStart() / sliceSize) * sliceSize + 1] += 1
144 bins[transcript.getChromosome()][(transcript.getStart() / sliceSize) * sliceSize + 1] += 1
145 maxSlice = max(maxSlice, transcript.getStart() / sliceSize)
146 progress.inc()
147 progress.done()
148
149 # compute densities
150 densityPlus = dict()
151 for chromosome in bins:
152 densityPlus[chromosome] = dict([(bin, 0) for bin in binsPlus[chromosome]])
153 for bin in binsPlus[chromosome]:
154 densityPlus[chromosome][bin] = float(binsPlus[chromosome][bin]) / sliceSize * magnifyingFactor
155 # correct densities for first and last bins
156 if start % sliceSize != 0:
157 densityPlus[chromosome][(start / sliceSize) * sliceSize + 1] = float(binsPlus[chromosome][(start / sliceSize) * sliceSize + 1]) / (sliceSize - (start % sliceSize)) * magnifyingFactor
158 if sizes[chromosome] % sliceSize != 0:
159 densityPlus[chromosome][(sizes[chromosome] / sliceSize) * sliceSize + 1] = float(binsPlus[chromosome][(sizes[chromosome] / sliceSize) * sliceSize + 1]) / (sizes[chromosome] % sliceSize) * magnifyingFactor
160 densityMinus = dict()
161 for chromosome in binsMinus:
162 densityMinus[chromosome] = dict([(bin, 0) for bin in binsMinus[chromosome]])
163 for bin in binsMinus[chromosome]:
164 densityMinus[chromosome][bin] = float(binsMinus[chromosome][bin]) / sliceSize * magnifyingFactor
165 # correct densities for first and last bins
166 if start % sliceSize != 0:
167 densityMinus[chromosome][(start / sliceSize) * sliceSize + 1] = float(binsMinus[chromosome][(start / sliceSize) * sliceSize + 1]) / (sliceSize - (start % sliceSize)) * magnifyingFactor
168 if sizes[chromosome] % sliceSize != 0:
169 densityMinus[chromosome][(sizes[chromosome] / sliceSize) * sliceSize + 1] = float(binsMinus[chromosome][(sizes[chromosome] / sliceSize) * sliceSize + 1]) / (sizes[chromosome] % sliceSize) * magnifyingFactor
170 density = dict()
171 for chromosome in bins:
172 density[chromosome] = dict([(bin, 0) for bin in bins[chromosome]])
173 for bin in bins[chromosome]:
174 density[chromosome][bin] = densityPlus[chromosome][bin] + densityMinus[chromosome][bin]
175
176 for chromosome in densityMinus:
177 for bin in densityMinus[chromosome]:
178 densityMinus[chromosome][bin] *= -1
179 for bin in binsMinus[chromosome]:
180 binsMinus[chromosome][bin] *= -1
181
182 for chromosome in density:
183 maxX = max(bins[chromosome].keys())
184 if maxX <= 1000:
185 unit = "nt."
186 ratio = 1.0
187 elif maxX <= 1000000:
188 unit = "kb"
189 ratio = 1000.0
190 else:
191 unit = "Mb"
192 ratio = 1000000.0
193 outputFileName = "%s_%s" % (options.outputFileName, chromosome)
194 if options.start != None and options.end != None:
195 outputFileName += ":%d-%d" % (options.start, options.end)
196 outputFileName += ".png"
197 plotter = RPlotter(outputFileName, options.verbosity)
198 plotter.setXLabel("Position on %s (in %s)" % (chromosome.replace("_", " "), unit))
199 plotter.setYLabel("# reads")
200 if options.bothStrands:
201 plotter.setImageSize(1000, 300)
202 else:
203 plotter.setImageSize(1000, 200)
204 if options.height != None:
205 plotter.setHeight(options.height)
206 if options.width != None:
207 plotter.setWidth(options.width)
208 if options.yMax != None:
209 plotter.setMinimumY(options.yMin)
210 if options.yMax != None:
211 plotter.setMaximumY(options.yMax)
212 if options.bothStrands :
213 if options.raw:
214 plotter.addLine(divideKeyDict(binsPlus[chromosome], ratio))
215 else:
216 plotter.addLine(divideKeyDict(densityPlus[chromosome], ratio))
217 if options.raw:
218 plotter.addLine(divideKeyDict(binsMinus[chromosome], ratio))
219 else:
220 plotter.addLine(divideKeyDict(densityMinus[chromosome], ratio))
221 else:
222 if options.raw:
223 plotter.addLine(divideKeyDict(bins[chromosome], ratio))
224 else:
225 plotter.addLine(divideKeyDict(density[chromosome], ratio))
226 plotter.plot()
227
228 if options.csv:
229 outputFileName = "%s" % (options.outputFileName)
230 if options.chromosome != None:
231 outputFileName += "_%s" % (options.chromosome)
232 if options.start != None and options.end != None:
233 outputFileName += ":%d-%d" % (options.start, options.end)
234 outputFileName += ".csv"
235 csvHandle = open(outputFileName, "w")
236 for slice in range(start / sliceSize, maxSlice + 1):
237 csvHandle.write(";%d-%d" % (slice * sliceSize + 1, (slice+1) * sliceSize))
238 csvHandle.write("\n")
239 if options.bothStrands:
240 for chromosome in densityPlus:
241 if len(densityPlus[chromosome]) > 0:
242 csvHandle.write("%s [+]" % (chromosome))
243 for slice in sorted(densityPlus[chromosome].keys()):
244 csvHandle.write(";%.2f" % (densityPlus[chromosome][slice]))
245 csvHandle.write("\n")
246 if len(densityMinus[chromosome]) > 0:
247 csvHandle.write("%s [-]" % (chromosome))
248 for slice in sorted(densityPlus[chromosome].keys()):
249 csvHandle.write(";%.2f" % (-densityMinus[chromosome][slice]))
250 csvHandle.write("\n")
251 else:
252 for chromosome in density:
253 if len(density[chromosome]) > 0:
254 csvHandle.write(chromosome)
255 for slice in sorted(density[chromosome].keys()):
256 csvHandle.write(";%.2f" % (density[chromosome][slice]))
257 csvHandle.write("\n")
258 csvHandle.close()
259
260 if options.gff:
261 chromosome = "" if options.chromosome == None else options.chromosome.capitalize()
262 start = "" if options.start == None else "%d" % (options.start)
263 end = "" if options.end == None else "%d" % (options.end)
264 link1 = "" if options.start == None and options.end == None else ":"
265 link2 = "" if options.start == None and options.end == None else "-"
266 writer = Gff3Writer("%s%s%s%s%s.gff3" % (options.outputFileName, link1, start, link2, end), options.verbosity)
267 cpt = 1
268 if options.raw:
269 valuesPlus = binsPlus
270 valuesMinus = binsMinus
271 values = bins
272 else:
273 valuesPlus = densityPlus
274 valuesMinus = densityMinus
275 values = density
276 if options.bothStrands:
277 for chromosome in values:
278 for slice in valuesPlus[chromosome]:
279 writer.addTranscript(setTranscript(chromosome, 1, slice, slice + sliceSize, "region%d" % (cpt), valuesPlus[chromosome][slice]))
280 cpt += 1
281 for slice in valuesMinus[chromosome]:
282 writer.addTranscript(setTranscript(chromosome, -1, slice, slice + sliceSize, "region%d" % (cpt), - valuesMinus[chromosome][slice]))
283 cpt += 1
284 else:
285 for chromosome in values:
286 for slice in values[chromosome]:
287 writer.addTranscript(setTranscript(chromosome, 1, slice, slice + sliceSize, "region%d" % (cpt), values[chromosome][slice]))
288 cpt += 1
289 writer.write()
290
291