Mercurial > repos > yufei-luo > s_mart
diff smart_toolShed/SMART/Java/Python/getDistribution.py @ 0:e0f8dcca02ed
Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.
author | yufei-luo |
---|---|
date | Thu, 17 Jan 2013 10:52:14 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Python/getDistribution.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,291 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +"""Get the repartition of some elements in a chromosomes""" + +import os +from optparse import OptionParser +from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer +from SMART.Java.Python.structure.Transcript import Transcript +from commons.core.writer.Gff3Writer import Gff3Writer +from SMART.Java.Python.misc.RPlotter import RPlotter +from SMART.Java.Python.misc.Progress import Progress +from math import * + +def divideKeyDict(dictionary, ratio): + return dict([(key / ratio, dictionary[key]) for key in dictionary]) + + +def setTranscript(chromosome, direction, start, end, name, value): + transcript = Transcript() + transcript.setChromosome(chromosome) + transcript.setDirection(direction) + transcript.setStart(start) + transcript.setEnd(end) + transcript.setName(name) + transcript.setTagValue("nbElements", value) + return transcript + + + +if __name__ == "__main__": + + magnifyingFactor = 1000 + + # parse command line + description = "Get Distribution v1.0.1: Get the distribution of the genomic coordinates on a genome. [Category: Visualization]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]") + parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input file [compulsory] [format: transcript file format]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]") + parser.add_option("-r", "--reference", dest="referenceFileName", action="store", default=None, type="string", help="file containing the genome [compulsory] [format: file in FASTA format]") + parser.add_option("-n", "--nbBins", dest="nbBins", action="store", default=1000, type="int", help="number of bins [default: 1000] [format: int]") + parser.add_option("-2", "--bothStrands", dest="bothStrands", action="store_true", default=False, help="plot one curve per strand [format: bool] [default: false]") + parser.add_option("-w", "--raw", dest="raw", action="store_true", default=False, help="plot raw number of occurrences instead of density [format: bool] [default: false]") + parser.add_option("-x", "--csv", dest="csv", action="store_true", default=False, help="write a .csv file [format: bool]") + parser.add_option("-c", "--chromosome", dest="chromosome", action="store", default=None, type="string", help="plot only a chromosome [format: string]") + parser.add_option("-s", "--start", dest="start", action="store", default=None, type="int", help="start from a given region [format: int]") + parser.add_option("-e", "--end", dest="end", action="store", default=None, type="int", help="end from a given region [format: int]") + parser.add_option("-y", "--yMin", dest="yMin", action="store", default=None, type="int", help="minimum value on the y-axis to plot [format: int]") + parser.add_option("-Y", "--yMax", dest="yMax", action="store", default=None, type="int", help="maximum value on the y-axis to plot [format: int]") + parser.add_option("-g", "--gff", dest="gff", action="store_true", default=False, help="also write GFF3 file [format: bool] [default: false]") + parser.add_option("-H", "--height", dest="height", action="store", default=None, type="int", help="height of the graphics [format: int] [default: 300]") + parser.add_option("-W", "--width", dest="width", action="store", default=None, type="int", help="width of the graphics [format: int] [default: 1000]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [default: 1] [format: int]") + parser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool]") + parser.add_option("-D", "--directory", dest="working_Dir", action="store", default=os.getcwd(), type="string", help="the directory to store the results [format: directory]") + (options, args) = parser.parse_args() + + sizes = {} + if options.referenceFileName != None: + # get the sizes of the chromosomes + referenceHandle = open(options.referenceFileName) + name = None + size = 0 + maxSize = 0 + for line in referenceHandle: + line = line.strip() + if line == "": continue + if line[0] == ">": + if name != None: + if options.verbosity > 10: + print name + sizes[name] = size + maxSize = max(maxSize, size) + size = 0 + name = line[1:] + else: + size += len(line) + sizes[name] = size + maxSize = max(maxSize, size) + if options.verbosity > 1: + print "done" + start = 0 + end = maxSize + else: + if options.chromosome == None or options.start == None or options.end == None: + raise Exception("Missing chromosome or start and end positions, or reference file") + maxSize = options.end + sizes[options.chromosome] = options.end + start = options.start + end = options.end + + + tmp1 = int(maxSize / float(options.nbBins)) + tmp2 = 10 ** (len("%d" % (tmp1))-2) + sliceSize = int((tmp1 / tmp2) * tmp2) + + bins = dict() + binsPlus = dict() + binsMinus = dict() + for chromosome in sizes: + bins[chromosome] = dict([(i * sliceSize + 1, 0) for i in range(start / sliceSize, sizes[chromosome] / sliceSize + 1)]) + binsPlus[chromosome] = dict([(i * sliceSize + 1, 0) for i in range(start / sliceSize, sizes[chromosome] / sliceSize + 1)]) + binsMinus[chromosome] = dict([(i * sliceSize + 1, 0) for i in range(start / sliceSize, sizes[chromosome] / sliceSize + 1)]) + + parser = TranscriptContainer(options.inputFileName, options.format, options.verbosity) + progress = Progress(parser.getNbTranscripts(), "Parsing %s" % (options.inputFileName), options.verbosity) + maxSlice = 0 + # count the number of reads + for transcript in parser.getIterator(): + if options.chromosome == None or (transcript.getChromosome() == options.chromosome and transcript.getStart() >= start and transcript.getStart() <= end): + if transcript.getDirection() == 1: + binsPlus[transcript.getChromosome()][(transcript.getStart() / sliceSize) * sliceSize + 1] += 1 + else: + binsMinus[transcript.getChromosome()][(transcript.getStart() / sliceSize) * sliceSize + 1] += 1 + bins[transcript.getChromosome()][(transcript.getStart() / sliceSize) * sliceSize + 1] += 1 + maxSlice = max(maxSlice, transcript.getStart() / sliceSize) + progress.inc() + progress.done() + + # compute densities + densityPlus = dict() + for chromosome in bins: + densityPlus[chromosome] = dict([(bin, 0) for bin in binsPlus[chromosome]]) + for bin in binsPlus[chromosome]: + densityPlus[chromosome][bin] = float(binsPlus[chromosome][bin]) / sliceSize * magnifyingFactor + # correct densities for first and last bins + if start % sliceSize != 0: + densityPlus[chromosome][(start / sliceSize) * sliceSize + 1] = float(binsPlus[chromosome][(start / sliceSize) * sliceSize + 1]) / (sliceSize - (start % sliceSize)) * magnifyingFactor + if sizes[chromosome] % sliceSize != 0: + densityPlus[chromosome][(sizes[chromosome] / sliceSize) * sliceSize + 1] = float(binsPlus[chromosome][(sizes[chromosome] / sliceSize) * sliceSize + 1]) / (sizes[chromosome] % sliceSize) * magnifyingFactor + densityMinus = dict() + for chromosome in binsMinus: + densityMinus[chromosome] = dict([(bin, 0) for bin in binsMinus[chromosome]]) + for bin in binsMinus[chromosome]: + densityMinus[chromosome][bin] = float(binsMinus[chromosome][bin]) / sliceSize * magnifyingFactor + # correct densities for first and last bins + if start % sliceSize != 0: + densityMinus[chromosome][(start / sliceSize) * sliceSize + 1] = float(binsMinus[chromosome][(start / sliceSize) * sliceSize + 1]) / (sliceSize - (start % sliceSize)) * magnifyingFactor + if sizes[chromosome] % sliceSize != 0: + densityMinus[chromosome][(sizes[chromosome] / sliceSize) * sliceSize + 1] = float(binsMinus[chromosome][(sizes[chromosome] / sliceSize) * sliceSize + 1]) / (sizes[chromosome] % sliceSize) * magnifyingFactor + density = dict() + for chromosome in bins: + density[chromosome] = dict([(bin, 0) for bin in bins[chromosome]]) + for bin in bins[chromosome]: + density[chromosome][bin] = densityPlus[chromosome][bin] + densityMinus[chromosome][bin] + + for chromosome in densityMinus: + for bin in densityMinus[chromosome]: + densityMinus[chromosome][bin] *= -1 + for bin in binsMinus[chromosome]: + binsMinus[chromosome][bin] *= -1 + + for chromosome in density: + maxX = max(bins[chromosome].keys()) + if maxX <= 1000: + unit = "nt." + ratio = 1.0 + elif maxX <= 1000000: + unit = "kb" + ratio = 1000.0 + else: + unit = "Mb" + ratio = 1000000.0 + outputFileName = "%s_%s" % (options.outputFileName, chromosome) + if options.start != None and options.end != None: + outputFileName += ":%d-%d" % (options.start, options.end) + outputFileName += ".png" + plotter = RPlotter(outputFileName, options.verbosity) + plotter.setXLabel("Position on %s (in %s)" % (chromosome.replace("_", " "), unit)) + plotter.setYLabel("# reads") + if options.bothStrands: + plotter.setImageSize(1000, 300) + else: + plotter.setImageSize(1000, 200) + if options.height != None: + plotter.setHeight(options.height) + if options.width != None: + plotter.setWidth(options.width) + if options.yMax != None: + plotter.setMinimumY(options.yMin) + if options.yMax != None: + plotter.setMaximumY(options.yMax) + if options.bothStrands : + if options.raw: + plotter.addLine(divideKeyDict(binsPlus[chromosome], ratio)) + else: + plotter.addLine(divideKeyDict(densityPlus[chromosome], ratio)) + if options.raw: + plotter.addLine(divideKeyDict(binsMinus[chromosome], ratio)) + else: + plotter.addLine(divideKeyDict(densityMinus[chromosome], ratio)) + else: + if options.raw: + plotter.addLine(divideKeyDict(bins[chromosome], ratio)) + else: + plotter.addLine(divideKeyDict(density[chromosome], ratio)) + plotter.plot() + + if options.csv: + outputFileName = "%s" % (options.outputFileName) + if options.chromosome != None: + outputFileName += "_%s" % (options.chromosome) + if options.start != None and options.end != None: + outputFileName += ":%d-%d" % (options.start, options.end) + outputFileName += ".csv" + csvHandle = open(outputFileName, "w") + for slice in range(start / sliceSize, maxSlice + 1): + csvHandle.write(";%d-%d" % (slice * sliceSize + 1, (slice+1) * sliceSize)) + csvHandle.write("\n") + if options.bothStrands: + for chromosome in densityPlus: + if len(densityPlus[chromosome]) > 0: + csvHandle.write("%s [+]" % (chromosome)) + for slice in sorted(densityPlus[chromosome].keys()): + csvHandle.write(";%.2f" % (densityPlus[chromosome][slice])) + csvHandle.write("\n") + if len(densityMinus[chromosome]) > 0: + csvHandle.write("%s [-]" % (chromosome)) + for slice in sorted(densityPlus[chromosome].keys()): + csvHandle.write(";%.2f" % (-densityMinus[chromosome][slice])) + csvHandle.write("\n") + else: + for chromosome in density: + if len(density[chromosome]) > 0: + csvHandle.write(chromosome) + for slice in sorted(density[chromosome].keys()): + csvHandle.write(";%.2f" % (density[chromosome][slice])) + csvHandle.write("\n") + csvHandle.close() + + if options.gff: + chromosome = "" if options.chromosome == None else options.chromosome.capitalize() + start = "" if options.start == None else "%d" % (options.start) + end = "" if options.end == None else "%d" % (options.end) + link1 = "" if options.start == None and options.end == None else ":" + link2 = "" if options.start == None and options.end == None else "-" + writer = Gff3Writer("%s%s%s%s%s.gff3" % (options.outputFileName, link1, start, link2, end), options.verbosity) + cpt = 1 + if options.raw: + valuesPlus = binsPlus + valuesMinus = binsMinus + values = bins + else: + valuesPlus = densityPlus + valuesMinus = densityMinus + values = density + if options.bothStrands: + for chromosome in values: + for slice in valuesPlus[chromosome]: + writer.addTranscript(setTranscript(chromosome, 1, slice, slice + sliceSize, "region%d" % (cpt), valuesPlus[chromosome][slice])) + cpt += 1 + for slice in valuesMinus[chromosome]: + writer.addTranscript(setTranscript(chromosome, -1, slice, slice + sliceSize, "region%d" % (cpt), - valuesMinus[chromosome][slice])) + cpt += 1 + else: + for chromosome in values: + for slice in values[chromosome]: + writer.addTranscript(setTranscript(chromosome, 1, slice, slice + sliceSize, "region%d" % (cpt), values[chromosome][slice])) + cpt += 1 + writer.write() + +