s_mart: smart_toolShed/SMART/Java/Python/getLetterDistribution.py comparison

comparison smart_toolShed/SMART/Java/Python/getLetterDistribution.py @ 0:e0f8dcca02ed

Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.

author	yufei-luo
date	Thu, 17 Jan 2013 10:52:14 -0500
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:e0f8dcca02ed
+#! /usr/bin/env python
+#
+# Copyright INRA-URGI 2009-2010
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+#
+# As a counterpart to the access to the source code and rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty and the software's author, the holder of the
+# economic rights, and the successive licensors have only limited
+# liability.
+#
+# In this respect, the user's attention is drawn to the risks associated
+# with loading, using, modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean that it is complicated to manipulate, and that also
+# therefore means that it is reserved for developers and experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and, more generally, to use and operate it in the
+# same conditions as regards security.
+#
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+#
+"""Get the size distribution of a Fasta / BED file"""
+import os
+from optparse import OptionParser
+from commons.core.parsing.FastaParser import *
+from SMART.Java.Python.misc.Progress import *
+from SMART.Java.Python.misc.RPlotter import *
+from commons.core.parsing.ParserChooser import ParserChooser
+def writeCVSfile(outHandler):
+for pos in range(len(letters)):
+posTrue = pos +1
+outHandler.write( "%s;" % (posTrue))
+for letter in lettersRate:
+if positionRate[letter].has_key(pos):
+outHandler.write("%s=%.2f%s;" %(letter, positionRate[letter][pos], "%"))
+else:
+outHandler.write("%s=0%s;" % (letter, "%"))
+outHandler.write("\n")
+if __name__ == "__main__":
+# parse command line
+description = "Get Letter Distribution v1.0.1: Compute the distribution of nucleotides of a set of genomic coordinates. [Category: Visualization]"
+parser = OptionParser(description = description)
+parser.add_option("-i", "--input",     dest="inputFileName",  action="store",                     type="string", help="input file to be analyzed [compulsory] [format: file in sequence format given by -f]")
+parser.add_option("-f", "--format",    dest="format",         action="store",                     type="string", help="format of file [format: sequence file format]")
+parser.add_option("-o", "--output",    dest="outputFileName", action="store",                     type="string", help="output file [compulsory] [format: output file in PNG format]")
+parser.add_option("-v", "--verbosity", dest="verbosity",      action="store",      default=1,     type="int",    help="trace level [format: int]")
+parser.add_option("-c", "--csv",       dest="csv",            action="store_true", default=False,                help="write a .csv file [format: bool] [default: false]")
+parser.add_option("-l", "--log",       dest="log",            action="store_true", default=False,                help="write a log file [format: bool] [default: false]")
+(options, args) = parser.parse_args()
+chooser = ParserChooser()
+chooser.findFormat(options.format)
+parser      = chooser.getParser(options.inputFileName)
+nbSequences = parser.getNbSequences()
+print "%i sequences read" % (nbSequences)
+# treat items
+progress       = Progress(nbSequences, "Analyzing sequences of " + options.inputFileName, options.verbosity)
+nbLettersTotal = 0
+nbLetters      = {}
+lettersRate    = {}
+nbPositions    = {}
+positionCount  = {}
+positionRate   = {}
+nbPositionRate = {}
+for sequence in parser.getIterator():
+letters            = sequence.getSequence()
+thisNbLettersTotal = sequence.getSize()
+nbLettersTotal    += thisNbLettersTotal
+thisNbLetters      = {}
+for pos in range(len(letters)):
+letter = letters[pos]
+if letter not in thisNbLetters:
+thisNbLetters[letter] = 1
+else:
+thisNbLetters[letter] += 1
+if pos+1 not in nbPositions:
+nbPositions[pos+1] = 1
+else:
+nbPositions[pos+1] += 1
+if letter not in positionCount:
+positionCount[letter] = {}
+if pos+1 not in positionCount[letter]:
+positionCount[letter][pos+1] = 1
+else:
+positionCount[letter][pos+1] += 1
+for letter in thisNbLetters:
+if letter not in nbLetters:
+nbLetters[letter] = thisNbLetters[letter]
+else:
+nbLetters[letter] += thisNbLetters[letter]
+if letter not in lettersRate:
+lettersRate[letter] = {}
+rate = int(float(thisNbLetters[letter]) / thisNbLettersTotal * 100)
+if rate not in lettersRate[letter]:
+lettersRate[letter][rate] = 1
+else:
+lettersRate[letter][rate] += 1
+progress.inc()
+progress.done()
+for letter in positionCount:
+positionRate[letter] = {}
+for pos in positionCount[letter]:
+positionRate[letter][pos] = positionCount[letter][pos] / float(nbPositions[pos]) * 100
+for pos in nbPositions:
+nbPositionRate[pos] = nbPositions[pos] / float(nbPositions[1]) * 100
+# plot content distributions
+plotter = RPlotter("%s.png" % (options.outputFileName), options.verbosity, True)
+plotter.setFill(0)
+plotter.setLegend(True)
+for letter in lettersRate:
+plotter.addLine(lettersRate[letter], letter)
+plotter.plot()
+# plot distribution per position
+plotter = RPlotter("%sPerNt.png" % (options.outputFileName), options.verbosity, True)
+plotter.setFill(0)
+plotter.setLegend(True)
+plotter.setXLabel("Position on the read")
+plotter.setYLabel("Percentage")
+for letter in positionRate:
+plotter.addLine(positionRate[letter], letter)
+plotter.addLine(nbPositionRate, "#")
+plotter.plot()
+if options.csv:
+outHandler = open("%s.csv" % (options.outputFileName), "w")
+writeCVSfile(outHandler)
+outHandler.close()
+print "%d sequences" % (nbSequences)
+print "%d letters" % (nbLettersTotal)
+for letter in nbLetters:
+print "%s: %d (%.2f%%)" % (letter, nbLetters[letter], float(nbLetters[letter]) / nbLettersTotal * 100)

Mercurial > repos > yufei-luo > s_mart

comparison smart_toolShed/SMART/Java/Python/getLetterDistribution.py @ 0:e0f8dcca02ed