view SMART/Java/Python/CountReadGCPercent.py @ 11:2da30502c2f1

Updated CompareOverlappingSmallQuery.xml
author m-zytnicki
date Thu, 14 Mar 2013 05:37:08 -0400
parents 769e306b7933
children
line wrap: on
line source

#!/usr/bin/env python

from optparse import OptionParser
from commons.core.parsing.FastaParser import FastaParser
from commons.core.writer.Gff3Writer import Gff3Writer
from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
from SMART.Java.Python.misc.Progress import Progress
from commons.core.utils.RepetOptionParser import RepetOptionParser
from Gnome_tools.CountGCPercentBySlidingWindow import CountGCPercentBySlidingWindow


class CountReadGCPercent(object):
    
    def __init__(self):
        self.referenceReader = None
        self.gffReader = None
        self.outputWriter = None
        self.verbose = 0
        
    def setInputReferenceFile(self, fileName):
        self.referenceReader = fileName
 
    def setInputGffFile(self, fileName):
        self.gffReader = TranscriptContainer(fileName, 'gff3', self.verbose)
        
    def setOutputFileName(self, fileName):
        self.outputWriter = Gff3Writer(fileName, self.verbose)

    def readGffAnnotation(self):
        self.coveredRegions = {}
        progress = Progress(self.gffReader.getNbTranscripts(), "Reading gff3 annotation file", self.verbose)
        for transcript in self.gffReader.getIterator():
            chromosome = transcript.getChromosome()
            if chromosome not in self.coveredRegions:
                self.coveredRegions[chromosome] = {}
            for exon in transcript.getExons():
                for position in range(exon.getStart(), exon.getEnd()+1):
                    self.coveredRegions[chromosome][position] = 1
            progress.inc()
        progress.done()
        
    def write(self):
        iParser = FastaParser(self.referenceReader)
        iParser.setTags()
        iGetGCPercentBySW = CountGCPercentBySlidingWindow()
        progress = Progress(self.gffReader.getNbTranscripts(), "Writing output file", self.verbose)
        for transcript in self.gffReader.getIterator():
            chromosome = transcript.getChromosome()
            GCpercent = 0
            nPercent = 0
            for exon in transcript.getExons():
                    for sequenceName in iParser.getTags().keys():
                        if sequenceName != chromosome:
                            continue
                        else:
                            subSequence = iParser.getSubSequence(sequenceName, exon.getStart() , exon.getEnd(), 1)
                            GCpercent, nPercent = iGetGCPercentBySW.getGCPercentAccordingToNAndNPercent(subSequence)
                            print "GCpercent = %f, nPercent = %f" % (GCpercent, nPercent)
            transcript.setTagValue("GCpercent", GCpercent)
            transcript.setTagValue("NPercent", nPercent)
            self.outputWriter.addTranscript(transcript)
            progress.inc()
        progress.done()
 
    def run(self):
        self.readGffAnnotation()
        if self.outputWriter != None:
            self.write()
            
if __name__ == "__main__":
        description = "Count GC percent for each read against a genome."
        usage = "CountReadGCPercent.py -i <fasta file> -j <gff3 file> -o <output gff3 file> -v <verbose> -h]"
        examples = "\nExample: \n"
        examples += "\t$ python CountReadGCPercent.py -i file.fasta -j annotation.gff -o output.gff3"
        examples += "\n\n"
        parser = RepetOptionParser(description = description, usage = usage, version = "v1.0", epilog = examples)
        parser.add_option( '-i', '--inputGenome', dest='fastaFile', help='fasta file [compulsory]', default= None )
        parser.add_option( '-j', '--inputAnnotation', dest='gffFile', help='gff3 file [compulsory]', default= None)
        parser.add_option( '-o', '--output', dest='outputFile', help='output gff3 file [compulsory]', default= None )
        parser.add_option( '-v', '--verbose', dest='verbose', help='verbosity level (default=0/1)',type="int", default= 0 )
        (options, args) = parser.parse_args()
    
        readGCPercent = CountReadGCPercent()
        readGCPercent.setInputReferenceFile(options.fastaFile)
        readGCPercent.setInputGffFile(options.gffFile)
        readGCPercent.setOutputFileName(options.outputFile)
        readGCPercent.run()