| 18 | 1 #! /usr/bin/env python | 
|  | 2 # | 
|  | 3 # Copyright INRA-URGI 2009-2012 | 
|  | 4 # | 
|  | 5 # This software is governed by the CeCILL license under French law and | 
|  | 6 # abiding by the rules of distribution of free software. You can use, | 
|  | 7 # modify and/ or redistribute the software under the terms of the CeCILL | 
|  | 8 # license as circulated by CEA, CNRS and INRIA at the following URL | 
|  | 9 # "http://www.cecill.info". | 
|  | 10 # | 
|  | 11 # As a counterpart to the access to the source code and rights to copy, | 
|  | 12 # modify and redistribute granted by the license, users are provided only | 
|  | 13 # with a limited warranty and the software's author, the holder of the | 
|  | 14 # economic rights, and the successive licensors have only limited | 
|  | 15 # liability. | 
|  | 16 # | 
|  | 17 # In this respect, the user's attention is drawn to the risks associated | 
|  | 18 # with loading, using, modifying and/or developing or reproducing the | 
|  | 19 # software by the user in light of its specific status of free software, | 
|  | 20 # that may mean that it is complicated to manipulate, and that also | 
|  | 21 # therefore means that it is reserved for developers and experienced | 
|  | 22 # professionals having in-depth computer knowledge. Users are therefore | 
|  | 23 # encouraged to load and test the software's suitability as regards their | 
|  | 24 # requirements in conditions enabling the security of their systems and/or | 
|  | 25 # data to be ensured and, more generally, to use and operate it in the | 
|  | 26 # same conditions as regards security. | 
|  | 27 # | 
|  | 28 # The fact that you are presently reading this means that you have had | 
|  | 29 # knowledge of the CeCILL license and that you accept its terms. | 
|  | 30 # | 
|  | 31 import os, os.path, random | 
|  | 32 from optparse import OptionParser | 
|  | 33 from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer | 
|  | 34 from commons.core.parsing.GffParser import GffParser | 
|  | 35 from commons.core.writer.Gff3Writer import Gff3Writer | 
|  | 36 from commons.core.writer.TranscriptWriter import TranscriptWriter | 
|  | 37 from SMART.Java.Python.misc.Progress import Progress | 
|  | 38 from SMART.Java.Python.misc.RPlotter import RPlotter | 
|  | 39 from SMART.Java.Python.cleanGff import CleanGff | 
|  | 40 from SMART.Java.Python.CompareOverlappingSmallRef import CompareOverlappingSmallRef | 
|  | 41 from SMART.Java.Python.structure.TranscriptListsComparator import TranscriptListsComparator | 
|  | 42 from SMART.Java.Python.GetUpDownStream import GetUpDownStream | 
|  | 43 | 
|  | 44 REFERENCE = 0 | 
|  | 45 QUERY = 1 | 
|  | 46 | 
|  | 47 class CountLoci(object): | 
|  | 48 | 
|  | 49     def __init__(self, verbosity = 1): | 
|  | 50         self.verbosity = verbosity | 
|  | 51         self.tmpFileNames = [] | 
|  | 52 | 
|  | 53     def __del__(self): | 
|  | 54         for fileName in self.tmpFileNames: | 
|  | 55             if os.path.exists(fileName): | 
|  | 56                 os.remove(fileName) | 
|  | 57 | 
|  | 58     def setInputFile(self, fileName, format): | 
|  | 59         self.inputFileName = fileName | 
|  | 60         self.inputFormat = format | 
|  | 61         self.parser = TranscriptContainer(fileName, format, self.verbosity-1) | 
|  | 62         if self.verbosity > 0: | 
|  | 63             print "%d elements in input" % (self.parser.getNbTranscripts()) | 
|  | 64 | 
|  | 65     def setReference(self, fileName): | 
|  | 66         self.referenceFileName = fileName | 
|  | 67 | 
|  | 68     def setDistance(self, distance): | 
|  | 69         self.distance = distance | 
|  | 70 | 
|  | 71     def setOutputFileName(self, fileName): | 
|  | 72         self.outputFileName = fileName | 
|  | 73         self.writer         = Gff3Writer(fileName, self.verbosity-1) | 
|  | 74         self.outputBase     = "%s_%d_" % (os.path.splitext(fileName)[0], random.randint(0, 10000)) | 
|  | 75 | 
|  | 76     def _writeTmpRef(self, tags, outputFileName): | 
|  | 77         cleanGff = CleanGff(self.verbosity-1) | 
|  | 78         cleanGff.setInputFileName(self.referenceFileName) | 
|  | 79         cleanGff.setOutputFileName(outputFileName) | 
|  | 80         cleanGff.setAcceptedTypes(tags) | 
|  | 81         cleanGff.run() | 
|  | 82 | 
|  | 83     def _getReferenceFiles(self): | 
|  | 84         self.referenceFiles = {"CDS":                       "%scds.gff3"      % (self.outputBase), \ | 
|  | 85                                "five_prime_UTR":            "%sfive.gff3"     % (self.outputBase), \ | 
|  | 86                                "three_prime_UTR":           "%sthree.gff3"    % (self.outputBase), \ | 
|  | 87                                "mRNA":                      "%smrna.gff3"     % (self.outputBase), \ | 
|  | 88                                "ncRNA":                     "%sncRNA.gff3"    % (self.outputBase), \ | 
|  | 89                                "transposable_element_gene": "%sTE.gff3"       % (self.outputBase), \ | 
|  | 90                                "vic":                       "%svicinity.gff3" % (self.outputBase)} | 
|  | 91         self.tmpFileNames.extend(self.referenceFiles.values()) | 
|  | 92         for tag, fileName in self.referenceFiles.iteritems(): | 
|  | 93             if tag == "ncRNA": | 
|  | 94                 self._writeTmpRef(["miRNA", "ncRNA", "rRNA", "snoRNA", "snRNA", "tRNA"], fileName) | 
|  | 95             elif tag == "vic": | 
|  | 96                 continue | 
|  | 97             else: | 
|  | 98                 self._writeTmpRef([tag], fileName) | 
|  | 99 | 
|  | 100     def _compare(self, queryFileName, queryFormat, referenceFileName, referenceFormat, outputFileName, exclusion = False): | 
|  | 101         co = CompareOverlappingSmallRef(self.verbosity-1) | 
|  | 102         co.setQueryFile(queryFileName, queryFormat) | 
|  | 103         co.setReferenceFile(referenceFileName, referenceFormat) | 
|  | 104         co.setOutputFile(outputFileName) | 
|  | 105         if exclusion: | 
|  | 106             co.setInvert(True) | 
|  | 107         co.run() | 
|  | 108         return co.nbWritten | 
|  | 109 | 
|  | 110     def _copy(self, inputFile, tag): | 
|  | 111         parser = GffParser(inputFile, self.verbosity-1) | 
|  | 112         for transcript in parser.getIterator(): | 
|  | 113             transcript.setTagValue("locus", tag) | 
|  | 114             self.writer.addTranscript(transcript) | 
|  | 115 | 
|  | 116     def _getCds(self): | 
|  | 117         outputFileName   = "%sin_cds.gff3" % (self.outputBase) | 
|  | 118         outputNoFileName = "%sin_nocds.gff3" % (self.outputBase) | 
|  | 119         self.tmpFileNames.extend([outputFileName, outputNoFileName]) | 
|  | 120         nbOverlaps = self._compare(self.inputFileName, self.inputFormat, self.referenceFiles["CDS"], "gff3", outputFileName) | 
|  | 121         self._compare(self.inputFileName, self.inputFormat, self.referenceFiles["CDS"], "gff3", outputNoFileName, True) | 
|  | 122         self._copy(outputFileName, "CDS") | 
|  | 123         if self.verbosity > 0: | 
|  | 124             print "%d overlaps in CDS" % (nbOverlaps) | 
|  | 125         return outputNoFileName | 
|  | 126 | 
|  | 127     def _getFivePrime(self, inputFileName): | 
|  | 128         outputFileName   = "%sin_five.gff3" % (self.outputBase) | 
|  | 129         outputNoFileName = "%sin_nofive.gff3" % (self.outputBase) | 
|  | 130         self.tmpFileNames.extend([outputFileName, outputNoFileName]) | 
|  | 131         nbOverlaps = self._compare(inputFileName, "gff3", self.referenceFiles["five_prime_UTR"], "gff3", outputFileName) | 
|  | 132         self._compare(inputFileName, "gff3", self.referenceFiles["five_prime_UTR"], "gff3", outputNoFileName, True) | 
|  | 133         self._copy(outputFileName, "five_prime_UTR") | 
|  | 134         if self.verbosity > 0: | 
|  | 135             print "%d overlaps in 5' UTR" % (nbOverlaps) | 
|  | 136         return outputNoFileName | 
|  | 137 | 
|  | 138     def _getThreePrime(self, inputFileName): | 
|  | 139         outputFileName   = "%sin_three.gff3" % (self.outputBase) | 
|  | 140         outputNoFileName = "%sin_nothree.gff3" % (self.outputBase) | 
|  | 141         self.tmpFileNames.extend([outputFileName, outputNoFileName]) | 
|  | 142         nbOverlaps = self._compare(inputFileName, "gff3", self.referenceFiles["three_prime_UTR"], "gff3", outputFileName) | 
|  | 143         self._compare(inputFileName, "gff3", self.referenceFiles["three_prime_UTR"], "gff3", outputNoFileName, True) | 
|  | 144         self._copy(outputFileName, "three_prime_UTR") | 
|  | 145         if self.verbosity > 0: | 
|  | 146             print "%d overlaps in 3' UTR" % (nbOverlaps) | 
|  | 147         return outputNoFileName | 
|  | 148 | 
|  | 149     def _getNcRna(self, inputFileName): | 
|  | 150         outputFileName   = "%sin_ncRna.gff3" % (self.outputBase) | 
|  | 151         outputNoFileName = "%sin_noNcRna.gff3" % (self.outputBase) | 
|  | 152         self.tmpFileNames.extend([outputFileName, outputNoFileName]) | 
|  | 153         nbOverlaps = self._compare(inputFileName, "gff3", self.referenceFiles["ncRNA"], "gff3", outputFileName) | 
|  | 154         self._compare(inputFileName, "gff3", self.referenceFiles["ncRNA"], "gff3", outputNoFileName, True) | 
|  | 155         self._copy(outputFileName, "ncRNA") | 
|  | 156         if self.verbosity > 0: | 
|  | 157             print "%d overlaps in ncRNA" % (nbOverlaps) | 
|  | 158         return outputNoFileName | 
|  | 159 | 
|  | 160     def _getTe(self, inputFileName): | 
|  | 161         outputFileName   = "%sin_te.gff3" % (self.outputBase) | 
|  | 162         outputNoFileName = "%sin_noTe.gff3" % (self.outputBase) | 
|  | 163         self.tmpFileNames.extend([outputFileName, outputNoFileName]) | 
|  | 164         nbOverlaps = self._compare(inputFileName, "gff3", self.referenceFiles["transposable_element_gene"], "gff3", outputFileName) | 
|  | 165         self._compare(inputFileName, "gff3", self.referenceFiles["transposable_element_gene"], "gff3", outputNoFileName, True) | 
|  | 166         self._copy(outputFileName, "TE") | 
|  | 167         if self.verbosity > 0: | 
|  | 168             print "%d overlaps in TE" % (nbOverlaps) | 
|  | 169         return outputNoFileName | 
|  | 170 | 
|  | 171     def _getIntron(self, inputFileName): | 
|  | 172         outputFileName   = "%sin_intron.gff3" % (self.outputBase) | 
|  | 173         outputNoFileName = "%sin_nointron.gff3" % (self.outputBase) | 
|  | 174         self.tmpFileNames.extend([outputFileName, outputNoFileName]) | 
|  | 175         nbOverlaps = self._compare(inputFileName, "gff3", self.referenceFiles["mRNA"], "gff3", outputFileName) | 
|  | 176         self._compare(inputFileName, "gff3", self.referenceFiles["mRNA"], "gff3", outputNoFileName, True) | 
|  | 177         self._copy(outputFileName, "intron") | 
|  | 178         if self.verbosity > 0: | 
|  | 179             print "%d overlaps in introns" % (nbOverlaps) | 
|  | 180         return outputNoFileName | 
|  | 181 | 
|  | 182     def _getVicinity(self, inputFileName): | 
|  | 183         guds = GetUpDownStream(self.verbosity-1) | 
|  | 184         guds.setInputFile(self.referenceFiles["mRNA"], "gff3") | 
|  | 185         guds.setOutputFile(self.referenceFiles["vic"]) | 
|  | 186         guds.setDistances(self.distance, self.distance) | 
|  | 187         guds.run() | 
|  | 188         outputFileName = "%sout_vicinity.gff3" % (self.outputBase) | 
|  | 189         outputNoFileName = "%sout_novicinity.gff3" % (self.outputBase) | 
|  | 190         self.tmpFileNames.extend([outputFileName, outputNoFileName]) | 
|  | 191         nbOverlaps = self._compare(inputFileName, "gff3", self.referenceFiles["vic"], "gff3", outputFileName) | 
|  | 192         nbNoOverlaps = self._compare(inputFileName, "gff3", self.referenceFiles["vic"], "gff3", outputNoFileName, True) | 
|  | 193         self._copy(outputFileName, "vicinity") | 
|  | 194         self._copy(outputNoFileName, "intergenic") | 
|  | 195         if self.verbosity > 0: | 
|  | 196             print "%d overlaps in vicinity" % (nbOverlaps) | 
|  | 197             print "%d elsewhere" % (nbNoOverlaps) | 
|  | 198 | 
|  | 199     def run(self): | 
|  | 200         self._getReferenceFiles() | 
|  | 201         outputFileName = self._getCds() | 
|  | 202         outputFileName = self._getFivePrime(outputFileName) | 
|  | 203         outputFileName = self._getThreePrime(outputFileName) | 
|  | 204         outputFileName = self._getNcRna(outputFileName) | 
|  | 205         outputFileName = self._getTe(outputFileName) | 
|  | 206         outputFileName = self._getIntron(outputFileName) | 
|  | 207         self._getVicinity(outputFileName) | 
|  | 208 | 
|  | 209 | 
|  | 210 | 
|  | 211 if __name__ == "__main__": | 
|  | 212 | 
|  | 213     # parse command line | 
|  | 214     description = "Count Loci v1.0.0: Count input elements with respect to CDS, 5' UTR, 3' UTR, intron, downstream, upstream. [Category: Personal]" | 
|  | 215 | 
|  | 216     parser = OptionParser(description = description) | 
|  | 217     parser.add_option("-i", "--input",     dest="inputFileName",  action="store",            type="string", help="input file              [compulsory] [format: file in transcript format given by -f]") | 
|  | 218     parser.add_option("-f", "--format",    dest="format",         action="store",            type="string", help="format of the input     [compulsory] [format: transcript file format]") | 
|  | 219     parser.add_option("-r", "--reference", dest="reference",      action="store",            type="string", help="reference file          [compulsory] [format: file in GFF format]") | 
|  | 220     parser.add_option("-o", "--output",    dest="outputFileName", action="store",            type="string", help="output file             [compulsory] [format: output file in GFF3 format]") | 
|  | 221     parser.add_option("-d", "--distance",  dest="distance",       action="store",            type="int",    help="distance up/down stream [compulsory] [format: output file in GFF3 format]") | 
|  | 222     parser.add_option("-v", "--verbosity", dest="verbosity",      action="store", default=1, type="int",    help="trace level                          [format: int]") | 
|  | 223     (options, args) = parser.parse_args() | 
|  | 224 | 
|  | 225     cl = CountLoci(options.verbosity) | 
|  | 226     cl.setInputFile(options.inputFileName, options.format) | 
|  | 227     cl.setDistance(options.distance) | 
|  | 228     cl.setReference(options.reference) | 
|  | 229     cl.setOutputFileName(options.outputFileName) | 
|  | 230     cl.run() |