Mercurial > repos > yufei-luo > s_mart
diff SMART/Java/Python/CountLoci.py @ 18:94ab73e8a190
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:20:15 -0400 |
parents | |
children | 0ab839023fe4 169d364ddd91 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/CountLoci.py Mon Apr 29 03:20:15 2013 -0400 @@ -0,0 +1,230 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2012 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
import os
import os.path
import random
from optparse import OptionParser
from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
from commons.core.parsing.GffParser import GffParser
from commons.core.writer.Gff3Writer import Gff3Writer
from commons.core.writer.TranscriptWriter import TranscriptWriter
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc.RPlotter import RPlotter
from SMART.Java.Python.cleanGff import CleanGff
from SMART.Java.Python.CompareOverlappingSmallRef import CompareOverlappingSmallRef
from SMART.Java.Python.structure.TranscriptListsComparator import TranscriptListsComparator
from SMART.Java.Python.GetUpDownStream import GetUpDownStream

REFERENCE = 0
QUERY = 1


class CountLoci(object):
    """Tag every input element with the kind of reference locus it overlaps.

    The input is compared, in a fixed cascade, with one reference subset after
    another (CDS, 5' UTR, 3' UTR, ncRNA, TE, mRNA, then the up/downstream
    vicinity of mRNAs).  At each step the elements reported as overlapping are
    copied to the output with a "locus" tag, and only the remaining elements
    are passed on to the next step.  Whatever is left at the end is tagged
    "intergenic".
    """

    def __init__(self, verbosity = 1):
        """Store the trace level and start with no temporary file registered."""
        self.verbosity = verbosity
        # Every intermediate file created by this object; removed in __del__.
        self.tmpFileNames = []

    def __del__(self):
        """Remove every registered temporary file that still exists."""
        for fileName in self.tmpFileNames:
            if os.path.exists(fileName):
                os.remove(fileName)

    def setInputFile(self, fileName, format):
        """Register the input file and its format, and open it as a TranscriptContainer."""
        self.inputFileName = fileName
        self.inputFormat = format
        self.parser = TranscriptContainer(fileName, format, self.verbosity-1)
        if self.verbosity > 0:
            print("%d elements in input" % (self.parser.getNbTranscripts()))

    def setReference(self, fileName):
        """Register the reference annotation file (later read through CleanGff)."""
        self.referenceFileName = fileName

    def setDistance(self, distance):
        """Set the distance given to GetUpDownStream for both of its distances."""
        self.distance = distance

    def setOutputFileName(self, fileName):
        """Open the GFF3 writer and derive the prefix of all temporary files.

        The prefix is the output name without its extension, followed by a
        random integer in [0, 10000].
        NOTE(review): two runs sharing an output name can draw the same
        integer; the suffix lowers the risk of a clash but does not remove it.
        """
        self.outputFileName = fileName
        self.writer = Gff3Writer(fileName, self.verbosity-1)
        self.outputBase = "%s_%d_" % (os.path.splitext(fileName)[0], random.randint(0, 10000))

    def _writeTmpRef(self, tags, outputFileName):
        """Run CleanGff on the reference, accepting only the feature types in *tags*."""
        cleanGff = CleanGff(self.verbosity-1)
        cleanGff.setInputFileName(self.referenceFileName)
        cleanGff.setOutputFileName(outputFileName)
        cleanGff.setAcceptedTypes(tags)
        cleanGff.run()

    def _getReferenceFiles(self):
        """Create one temporary reference file per locus category.

        Fills ``self.referenceFiles`` (category -> file name) and registers
        all the names for clean-up.  The "vic" entry only reserves a name: the
        file itself is produced later by ``_getVicinity``.
        """
        base = self.outputBase
        self.referenceFiles = {
            "CDS": "%scds.gff3" % (base),
            "five_prime_UTR": "%sfive.gff3" % (base),
            "three_prime_UTR": "%sthree.gff3" % (base),
            "mRNA": "%smrna.gff3" % (base),
            "ncRNA": "%sncRNA.gff3" % (base),
            "transposable_element_gene": "%sTE.gff3" % (base),
            "vic": "%svicinity.gff3" % (base),
        }
        self.tmpFileNames.extend(self.referenceFiles.values())
        # items() behaves like the former iteritems() here and also exists on Python 3.
        for tag, fileName in self.referenceFiles.items():
            if tag == "vic":
                # Derived from the mRNA file in _getVicinity, not from the reference.
                continue
            if tag == "ncRNA":
                # The ncRNA category groups several feature types.
                self._writeTmpRef(["miRNA", "ncRNA", "rRNA", "snoRNA", "snRNA", "tRNA"], fileName)
            else:
                self._writeTmpRef([tag], fileName)

    def _compare(self, queryFileName, queryFormat, referenceFileName, referenceFormat, outputFileName, exclusion = False):
        """Run CompareOverlappingSmallRef and return its ``nbWritten`` counter.

        When *exclusion* is true the comparison is inverted through
        ``setInvert(True)`` (presumably keeping the query elements that do
        NOT overlap the reference; confirm against CompareOverlappingSmallRef).
        """
        co = CompareOverlappingSmallRef(self.verbosity-1)
        co.setQueryFile(queryFileName, queryFormat)
        co.setReferenceFile(referenceFileName, referenceFormat)
        co.setOutputFile(outputFileName)
        if exclusion:
            co.setInvert(True)
        co.run()
        return co.nbWritten

    def _copy(self, inputFile, tag):
        """Append every transcript of GFF file *inputFile* to the output, tagged ``locus=tag``."""
        parser = GffParser(inputFile, self.verbosity-1)
        for transcript in parser.getIterator():
            transcript.setTagValue("locus", tag)
            self.writer.addTranscript(transcript)

    def _partition(self, inputFileName, inputFormat, referenceTag, locusTag, label, inStem, outStem, restLocusTag = None):
        """Split *inputFileName* against one reference category.

        Shared implementation of every ``_getX`` step: runs the comparison
        and its inverted counterpart, copies the selected elements to the
        output under *locusTag*, and reports the count.

        inputFileName, inputFormat -- elements to classify and their format
        referenceTag -- key of ``self.referenceFiles`` to compare with
        locusTag     -- value of the "locus" tag for selected elements
        label        -- category name used in the trace message
        inStem       -- file-name stem for the selected elements
        outStem      -- file-name stem for the remaining elements
        restLocusTag -- if given, the remaining elements are also copied to
                        the output, under this "locus" value

        Returns ``(remainingFileName, nbRemaining)``.
        """
        outputFileName = "%s%s.gff3" % (self.outputBase, inStem)
        outputNoFileName = "%s%s.gff3" % (self.outputBase, outStem)
        self.tmpFileNames.extend([outputFileName, outputNoFileName])
        referenceFileName = self.referenceFiles[referenceTag]
        nbOverlaps = self._compare(inputFileName, inputFormat, referenceFileName, "gff3", outputFileName)
        nbNoOverlaps = self._compare(inputFileName, inputFormat, referenceFileName, "gff3", outputNoFileName, True)
        self._copy(outputFileName, locusTag)
        if restLocusTag is not None:
            self._copy(outputNoFileName, restLocusTag)
        if self.verbosity > 0:
            print("%d overlaps in %s" % (nbOverlaps, label))
        return outputNoFileName, nbNoOverlaps

    def _getCds(self):
        """Tag the input elements selected against CDS; return the file of the others."""
        return self._partition(self.inputFileName, self.inputFormat, "CDS", "CDS", "CDS", "in_cds", "in_nocds")[0]

    def _getFivePrime(self, inputFileName):
        """Tag the elements selected against 5' UTRs; return the file of the others."""
        return self._partition(inputFileName, "gff3", "five_prime_UTR", "five_prime_UTR", "5' UTR", "in_five", "in_nofive")[0]

    def _getThreePrime(self, inputFileName):
        """Tag the elements selected against 3' UTRs; return the file of the others."""
        return self._partition(inputFileName, "gff3", "three_prime_UTR", "three_prime_UTR", "3' UTR", "in_three", "in_nothree")[0]

    def _getNcRna(self, inputFileName):
        """Tag the elements selected against ncRNAs; return the file of the others."""
        return self._partition(inputFileName, "gff3", "ncRNA", "ncRNA", "ncRNA", "in_ncRna", "in_noNcRna")[0]

    def _getTe(self, inputFileName):
        """Tag the elements selected against TE genes; return the file of the others."""
        return self._partition(inputFileName, "gff3", "transposable_element_gene", "TE", "TE", "in_te", "in_noTe")[0]

    def _getIntron(self, inputFileName):
        """Tag as "intron" the elements selected against mRNAs; return the file of the others.

        This step runs after the CDS and UTR steps, so it only sees elements
        those steps did not select.
        """
        return self._partition(inputFileName, "gff3", "mRNA", "intron", "introns", "in_intron", "in_nointron")[0]

    def _getVicinity(self, inputFileName):
        """Tag the last elements as "vicinity" or "intergenic".

        The vicinity reference is built by running GetUpDownStream on the
        mRNA reference with ``self.distance`` for both distances.
        """
        guds = GetUpDownStream(self.verbosity-1)
        guds.setInputFile(self.referenceFiles["mRNA"], "gff3")
        guds.setOutputFile(self.referenceFiles["vic"])
        guds.setDistances(self.distance, self.distance)
        guds.run()
        nbNoOverlaps = self._partition(inputFileName, "gff3", "vic", "vicinity", "vicinity", "out_vicinity", "out_novicinity", "intergenic")[1]
        if self.verbosity > 0:
            print("%d elsewhere" % (nbNoOverlaps))

    def run(self):
        """Run the whole cascade; each step works on what the previous one left.

        NOTE(review): the Gff3Writer is never closed explicitly here;
        presumably it finalizes its file on destruction -- confirm.
        """
        self._getReferenceFiles()
        outputFileName = self._getCds()
        outputFileName = self._getFivePrime(outputFileName)
        outputFileName = self._getThreePrime(outputFileName)
        outputFileName = self._getNcRna(outputFileName)
        outputFileName = self._getTe(outputFileName)
        outputFileName = self._getIntron(outputFileName)
        self._getVicinity(outputFileName)



if __name__ == "__main__":

    # parse command line
    description = "Count Loci v1.0.0: Count input elements with respect to CDS, 5' UTR, 3' UTR, intron, downstream, upstream. [Category: Personal]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input [compulsory] [format: transcript file format]")
    parser.add_option("-r", "--reference", dest="reference", action="store", type="string", help="reference file [compulsory] [format: file in GFF format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    # The help text used to repeat the "-o" format; the option is an int.
    parser.add_option("-d", "--distance", dest="distance", action="store", type="int", help="distance up/down stream [compulsory] [format: int]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    # Fail early, with the usage message, instead of crashing deep inside the
    # pipeline when an option documented as compulsory was not given.
    compulsory = (
        ("-i/--input", options.inputFileName),
        ("-f/--format", options.format),
        ("-r/--reference", options.reference),
        ("-o/--output", options.outputFileName),
        ("-d/--distance", options.distance),
    )
    missing = [flag for flag, value in compulsory if value is None]
    if missing:
        parser.error("missing compulsory option(s): %s" % ", ".join(missing))

    cl = CountLoci(options.verbosity)
    cl.setInputFile(options.inputFileName, options.format)
    cl.setDistance(options.distance)
    cl.setReference(options.reference)
    cl.setOutputFileName(options.outputFileName)
    cl.run()