Mercurial > repos > yufei-luo > s_mart
view SMART/Java/Python/CountReadGCPercent.py @ 69:1473ab954708 draft
Corrected bug in "CollapsedReads" XML file.
author | m-zytnicki |
---|---|
date | Wed, 18 Nov 2015 10:59:02 -0500 |
parents | 769e306b7933 |
children |
line wrap: on
line source
#!/usr/bin/env python from optparse import OptionParser from commons.core.parsing.FastaParser import FastaParser from commons.core.writer.Gff3Writer import Gff3Writer from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer from SMART.Java.Python.misc.Progress import Progress from commons.core.utils.RepetOptionParser import RepetOptionParser from Gnome_tools.CountGCPercentBySlidingWindow import CountGCPercentBySlidingWindow class CountReadGCPercent(object): def __init__(self): self.referenceReader = None self.gffReader = None self.outputWriter = None self.verbose = 0 def setInputReferenceFile(self, fileName): self.referenceReader = fileName def setInputGffFile(self, fileName): self.gffReader = TranscriptContainer(fileName, 'gff3', self.verbose) def setOutputFileName(self, fileName): self.outputWriter = Gff3Writer(fileName, self.verbose) def readGffAnnotation(self): self.coveredRegions = {} progress = Progress(self.gffReader.getNbTranscripts(), "Reading gff3 annotation file", self.verbose) for transcript in self.gffReader.getIterator(): chromosome = transcript.getChromosome() if chromosome not in self.coveredRegions: self.coveredRegions[chromosome] = {} for exon in transcript.getExons(): for position in range(exon.getStart(), exon.getEnd()+1): self.coveredRegions[chromosome][position] = 1 progress.inc() progress.done() def write(self): iParser = FastaParser(self.referenceReader) iParser.setTags() iGetGCPercentBySW = CountGCPercentBySlidingWindow() progress = Progress(self.gffReader.getNbTranscripts(), "Writing output file", self.verbose) for transcript in self.gffReader.getIterator(): chromosome = transcript.getChromosome() GCpercent = 0 nPercent = 0 for exon in transcript.getExons(): for sequenceName in iParser.getTags().keys(): if sequenceName != chromosome: continue else: subSequence = iParser.getSubSequence(sequenceName, exon.getStart() , exon.getEnd(), 1) GCpercent, nPercent = iGetGCPercentBySW.getGCPercentAccordingToNAndNPercent(subSequence) print "GCpercent = %f, nPercent = %f" % (GCpercent, nPercent) transcript.setTagValue("GCpercent", GCpercent) transcript.setTagValue("NPercent", nPercent) self.outputWriter.addTranscript(transcript) progress.inc() progress.done() def run(self): self.readGffAnnotation() if self.outputWriter != None: self.write() if __name__ == "__main__": description = "Count GC percent for each read against a genome." usage = "CountReadGCPercent.py -i <fasta file> -j <gff3 file> -o <output gff3 file> -v <verbose> -h]" examples = "\nExample: \n" examples += "\t$ python CountReadGCPercent.py -i file.fasta -j annotation.gff -o output.gff3" examples += "\n\n" parser = RepetOptionParser(description = description, usage = usage, version = "v1.0", epilog = examples) parser.add_option( '-i', '--inputGenome', dest='fastaFile', help='fasta file [compulsory]', default= None ) parser.add_option( '-j', '--inputAnnotation', dest='gffFile', help='gff3 file [compulsory]', default= None) parser.add_option( '-o', '--output', dest='outputFile', help='output gff3 file [compulsory]', default= None ) parser.add_option( '-v', '--verbose', dest='verbose', help='verbosity level (default=0/1)',type="int", default= 0 ) (options, args) = parser.parse_args() readGCPercent = CountReadGCPercent() readGCPercent.setInputReferenceFile(options.fastaFile) readGCPercent.setInputGffFile(options.gffFile) readGCPercent.setOutputFileName(options.outputFile) readGCPercent.run()