view SMART/Java/Python/getLetterDistribution.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents 769e306b7933
children
line wrap: on
line source

#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
# 
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
# 
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
# 
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
# 
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
"""Get the size distribution of a Fasta / BED file"""

import os
from optparse import OptionParser
from commons.core.parsing.FastaParser import *
from SMART.Java.Python.misc.Progress import *
from SMART.Java.Python.misc.RPlotter import *
from commons.core.parsing.ParserChooser import ParserChooser


def writeCVSfile(outHandler):
    for pos in range(len(letters)):
        posTrue = pos +1
        outHandler.write( "%s;" % (posTrue))
        for letter in lettersRate:
            if positionRate[letter].has_key(pos):
                outHandler.write("%s=%.2f%s;" %(letter, positionRate[letter][pos], "%"))
            else:
                outHandler.write("%s=0%s;" % (letter, "%"))
        outHandler.write("\n")

if __name__ == "__main__":

    # parse command line
    description = "Get Letter Distribution v1.0.1: Compute the distribution of nucleotides of a set of genomic coordinates. [Category: Visualization]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input",     dest="inputFileName",  action="store",                     type="string", help="input file to be analyzed [compulsory] [format: file in sequence format given by -f]")
    parser.add_option("-f", "--format",    dest="format",         action="store",                     type="string", help="format of file [format: sequence file format]")
    parser.add_option("-o", "--output",    dest="outputFileName", action="store",                     type="string", help="output file [compulsory] [format: output file in PNG format]")
    parser.add_option("-v", "--verbosity", dest="verbosity",      action="store",      default=1,     type="int",    help="trace level [format: int]")
    parser.add_option("-c", "--csv",       dest="csv",            action="store_true", default=False,                help="write a .csv file [format: bool] [default: false]")
    parser.add_option("-l", "--log",       dest="log",            action="store_true", default=False,                help="write a log file [format: bool] [default: false]")
    (options, args) = parser.parse_args()

    chooser = ParserChooser()
    chooser.findFormat(options.format)    
    parser      = chooser.getParser(options.inputFileName)
    nbSequences = parser.getNbSequences()
    print "%i sequences read" % (nbSequences)

    # treat items
    progress       = Progress(nbSequences, "Analyzing sequences of " + options.inputFileName, options.verbosity)
    nbLettersTotal = 0
    nbLetters      = {}
    lettersRate    = {}
    nbPositions    = {}
    positionCount  = {}
    positionRate   = {}
    nbPositionRate = {}
    for sequence in parser.getIterator():
        letters            = sequence.getSequence()
        thisNbLettersTotal = sequence.getSize()
        nbLettersTotal    += thisNbLettersTotal
        thisNbLetters      = {}
        
        for pos in range(len(letters)):
            letter = letters[pos]
            if letter not in thisNbLetters:
                thisNbLetters[letter] = 1
            else:
                thisNbLetters[letter] += 1
            if pos+1 not in nbPositions:
                nbPositions[pos+1] = 1
            else:
                nbPositions[pos+1] += 1
            if letter not in positionCount:
                positionCount[letter] = {}
            if pos+1 not in positionCount[letter]:
                positionCount[letter][pos+1] = 1
            else:
                positionCount[letter][pos+1] += 1

        for letter in thisNbLetters:
            if letter not in nbLetters:
                nbLetters[letter] = thisNbLetters[letter]
            else:
                nbLetters[letter] += thisNbLetters[letter]
            if letter not in lettersRate:
                lettersRate[letter] = {}
            rate = int(float(thisNbLetters[letter]) / thisNbLettersTotal * 100)
            if rate not in lettersRate[letter]:
                lettersRate[letter][rate] = 1
            else:
                lettersRate[letter][rate] += 1
        progress.inc()
    progress.done()
    
    for letter in positionCount:
        positionRate[letter] = {}
        for pos in positionCount[letter]:
            positionRate[letter][pos] = positionCount[letter][pos] / float(nbPositions[pos]) * 100
    for pos in nbPositions:
        nbPositionRate[pos] = nbPositions[pos] / float(nbPositions[1]) * 100

    # plot content distributions
    plotter = RPlotter("%s.png" % (options.outputFileName), options.verbosity, True)
    plotter.setFill(0)
    plotter.setLegend(True)
    for letter in lettersRate:
        plotter.addLine(lettersRate[letter], letter)
    plotter.plot()
    
    # plot distribution per position
    plotter = RPlotter("%sPerNt.png" % (options.outputFileName), options.verbosity, True)
    plotter.setFill(0)
    plotter.setLegend(True)
    plotter.setXLabel("Position on the read")
    plotter.setYLabel("Percentage")
    for letter in positionRate:
        plotter.addLine(positionRate[letter], letter)
    plotter.addLine(nbPositionRate, "#")
    plotter.plot()

    if options.csv:
        outHandler = open("%s.csv" % (options.outputFileName), "w")
        writeCVSfile(outHandler)
        outHandler.close() 
 
    print "%d sequences" % (nbSequences)
    print "%d letters" % (nbLettersTotal)
    for letter in nbLetters:
        print "%s: %d (%.2f%%)" % (letter, nbLetters[letter], float(nbLetters[letter]) / nbLettersTotal * 100)