diff SMART/Java/Python/getLetterDistribution.py @ 36:44d5973c188c

Uploaded
author m-zytnicki
date Tue, 30 Apr 2013 15:02:29 -0400
parents 769e306b7933
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/SMART/Java/Python/getLetterDistribution.py	Tue Apr 30 15:02:29 2013 -0400
@@ -0,0 +1,153 @@
+#! /usr/bin/env python
+#
+# Copyright INRA-URGI 2009-2010
+# 
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+# 
+# As a counterpart to the access to the source code and rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty and the software's author, the holder of the
+# economic rights, and the successive licensors have only limited
+# liability.
+# 
+# In this respect, the user's attention is drawn to the risks associated
+# with loading, using, modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean that it is complicated to manipulate, and that also
+# therefore means that it is reserved for developers and experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and, more generally, to use and operate it in the
+# same conditions as regards security.
+# 
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+#
+"""Get the size distribution of a Fasta / BED file"""
+
+import os
+from optparse import OptionParser
+from commons.core.parsing.FastaParser import *
+from SMART.Java.Python.misc.Progress import *
+from SMART.Java.Python.misc.RPlotter import *
+from commons.core.parsing.ParserChooser import ParserChooser
+
+
+def writeCVSfile(outHandler):
+    for pos in range(len(letters)):
+        posTrue = pos +1
+        outHandler.write( "%s;" % (posTrue))
+        for letter in lettersRate:
+            if positionRate[letter].has_key(pos):
+                outHandler.write("%s=%.2f%s;" %(letter, positionRate[letter][pos], "%"))
+            else:
+                outHandler.write("%s=0%s;" % (letter, "%"))
+        outHandler.write("\n")
+
+if __name__ == "__main__":
+
+    # parse command line
+    description = "Get Letter Distribution v1.0.1: Compute the distribution of nucleotides of a set of genomic coordinates. [Category: Visualization]"
+
+    parser = OptionParser(description = description)
+    parser.add_option("-i", "--input",     dest="inputFileName",  action="store",                     type="string", help="input file to be analyzed [compulsory] [format: file in sequence format given by -f]")
+    parser.add_option("-f", "--format",    dest="format",         action="store",                     type="string", help="format of file [format: sequence file format]")
+    parser.add_option("-o", "--output",    dest="outputFileName", action="store",                     type="string", help="output file [compulsory] [format: output file in PNG format]")
+    parser.add_option("-v", "--verbosity", dest="verbosity",      action="store",      default=1,     type="int",    help="trace level [format: int]")
+    parser.add_option("-c", "--csv",       dest="csv",            action="store_true", default=False,                help="write a .csv file [format: bool] [default: false]")
+    parser.add_option("-l", "--log",       dest="log",            action="store_true", default=False,                help="write a log file [format: bool] [default: false]")
+    (options, args) = parser.parse_args()
+
+    chooser = ParserChooser()
+    chooser.findFormat(options.format)    
+    parser      = chooser.getParser(options.inputFileName)
+    nbSequences = parser.getNbSequences()
+    print "%i sequences read" % (nbSequences)
+
+    # treat items
+    progress       = Progress(nbSequences, "Analyzing sequences of " + options.inputFileName, options.verbosity)
+    nbLettersTotal = 0
+    nbLetters      = {}
+    lettersRate    = {}
+    nbPositions    = {}
+    positionCount  = {}
+    positionRate   = {}
+    nbPositionRate = {}
+    for sequence in parser.getIterator():
+        letters            = sequence.getSequence()
+        thisNbLettersTotal = sequence.getSize()
+        nbLettersTotal    += thisNbLettersTotal
+        thisNbLetters      = {}
+        
+        for pos in range(len(letters)):
+            letter = letters[pos]
+            if letter not in thisNbLetters:
+                thisNbLetters[letter] = 1
+            else:
+                thisNbLetters[letter] += 1
+            if pos+1 not in nbPositions:
+                nbPositions[pos+1] = 1
+            else:
+                nbPositions[pos+1] += 1
+            if letter not in positionCount:
+                positionCount[letter] = {}
+            if pos+1 not in positionCount[letter]:
+                positionCount[letter][pos+1] = 1
+            else:
+                positionCount[letter][pos+1] += 1
+
+        for letter in thisNbLetters:
+            if letter not in nbLetters:
+                nbLetters[letter] = thisNbLetters[letter]
+            else:
+                nbLetters[letter] += thisNbLetters[letter]
+            if letter not in lettersRate:
+                lettersRate[letter] = {}
+            rate = int(float(thisNbLetters[letter]) / thisNbLettersTotal * 100)
+            if rate not in lettersRate[letter]:
+                lettersRate[letter][rate] = 1
+            else:
+                lettersRate[letter][rate] += 1
+        progress.inc()
+    progress.done()
+    
+    for letter in positionCount:
+        positionRate[letter] = {}
+        for pos in positionCount[letter]:
+            positionRate[letter][pos] = positionCount[letter][pos] / float(nbPositions[pos]) * 100
+    for pos in nbPositions:
+        nbPositionRate[pos] = nbPositions[pos] / float(nbPositions[1]) * 100
+
+    # plot content distributions
+    plotter = RPlotter("%s.png" % (options.outputFileName), options.verbosity, True)
+    plotter.setFill(0)
+    plotter.setLegend(True)
+    for letter in lettersRate:
+        plotter.addLine(lettersRate[letter], letter)
+    plotter.plot()
+    
+    # plot distribution per position
+    plotter = RPlotter("%sPerNt.png" % (options.outputFileName), options.verbosity, True)
+    plotter.setFill(0)
+    plotter.setLegend(True)
+    plotter.setXLabel("Position on the read")
+    plotter.setYLabel("Percentage")
+    for letter in positionRate:
+        plotter.addLine(positionRate[letter], letter)
+    plotter.addLine(nbPositionRate, "#")
+    plotter.plot()
+
+    if options.csv:
+        outHandler = open("%s.csv" % (options.outputFileName), "w")
+        writeCVSfile(outHandler)
+        outHandler.close() 
+ 
+    print "%d sequences" % (nbSequences)
+    print "%d letters" % (nbLettersTotal)
+    for letter in nbLetters:
+        print "%s: %d (%.2f%%)" % (letter, nbLetters[letter], float(nbLetters[letter]) / nbLettersTotal * 100)