comparison SMART/Java/Python/CountLoci.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children 0ab839023fe4 169d364ddd91
comparison
equal deleted inserted replaced
17:b0e8584489e6 18:94ab73e8a190
1 #! /usr/bin/env python
2 #
3 # Copyright INRA-URGI 2009-2012
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30 #
31 import os, os.path, random
32 from optparse import OptionParser
33 from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
34 from commons.core.parsing.GffParser import GffParser
35 from commons.core.writer.Gff3Writer import Gff3Writer
36 from commons.core.writer.TranscriptWriter import TranscriptWriter
37 from SMART.Java.Python.misc.Progress import Progress
38 from SMART.Java.Python.misc.RPlotter import RPlotter
39 from SMART.Java.Python.cleanGff import CleanGff
40 from SMART.Java.Python.CompareOverlappingSmallRef import CompareOverlappingSmallRef
41 from SMART.Java.Python.structure.TranscriptListsComparator import TranscriptListsComparator
42 from SMART.Java.Python.GetUpDownStream import GetUpDownStream
43
44 REFERENCE = 0
45 QUERY = 1
46
47 class CountLoci(object):
48
49 def __init__(self, verbosity = 1):
50 self.verbosity = verbosity
51 self.tmpFileNames = []
52
53 def __del__(self):
54 for fileName in self.tmpFileNames:
55 if os.path.exists(fileName):
56 os.remove(fileName)
57
58 def setInputFile(self, fileName, format):
59 self.inputFileName = fileName
60 self.inputFormat = format
61 self.parser = TranscriptContainer(fileName, format, self.verbosity-1)
62 if self.verbosity > 0:
63 print "%d elements in input" % (self.parser.getNbTranscripts())
64
65 def setReference(self, fileName):
66 self.referenceFileName = fileName
67
68 def setDistance(self, distance):
69 self.distance = distance
70
71 def setOutputFileName(self, fileName):
72 self.outputFileName = fileName
73 self.writer = Gff3Writer(fileName, self.verbosity-1)
74 self.outputBase = "%s_%d_" % (os.path.splitext(fileName)[0], random.randint(0, 10000))
75
76 def _writeTmpRef(self, tags, outputFileName):
77 cleanGff = CleanGff(self.verbosity-1)
78 cleanGff.setInputFileName(self.referenceFileName)
79 cleanGff.setOutputFileName(outputFileName)
80 cleanGff.setAcceptedTypes(tags)
81 cleanGff.run()
82
83 def _getReferenceFiles(self):
84 self.referenceFiles = {"CDS": "%scds.gff3" % (self.outputBase), \
85 "five_prime_UTR": "%sfive.gff3" % (self.outputBase), \
86 "three_prime_UTR": "%sthree.gff3" % (self.outputBase), \
87 "mRNA": "%smrna.gff3" % (self.outputBase), \
88 "ncRNA": "%sncRNA.gff3" % (self.outputBase), \
89 "transposable_element_gene": "%sTE.gff3" % (self.outputBase), \
90 "vic": "%svicinity.gff3" % (self.outputBase)}
91 self.tmpFileNames.extend(self.referenceFiles.values())
92 for tag, fileName in self.referenceFiles.iteritems():
93 if tag == "ncRNA":
94 self._writeTmpRef(["miRNA", "ncRNA", "rRNA", "snoRNA", "snRNA", "tRNA"], fileName)
95 elif tag == "vic":
96 continue
97 else:
98 self._writeTmpRef([tag], fileName)
99
100 def _compare(self, queryFileName, queryFormat, referenceFileName, referenceFormat, outputFileName, exclusion = False):
101 co = CompareOverlappingSmallRef(self.verbosity-1)
102 co.setQueryFile(queryFileName, queryFormat)
103 co.setReferenceFile(referenceFileName, referenceFormat)
104 co.setOutputFile(outputFileName)
105 if exclusion:
106 co.setInvert(True)
107 co.run()
108 return co.nbWritten
109
110 def _copy(self, inputFile, tag):
111 parser = GffParser(inputFile, self.verbosity-1)
112 for transcript in parser.getIterator():
113 transcript.setTagValue("locus", tag)
114 self.writer.addTranscript(transcript)
115
116 def _getCds(self):
117 outputFileName = "%sin_cds.gff3" % (self.outputBase)
118 outputNoFileName = "%sin_nocds.gff3" % (self.outputBase)
119 self.tmpFileNames.extend([outputFileName, outputNoFileName])
120 nbOverlaps = self._compare(self.inputFileName, self.inputFormat, self.referenceFiles["CDS"], "gff3", outputFileName)
121 self._compare(self.inputFileName, self.inputFormat, self.referenceFiles["CDS"], "gff3", outputNoFileName, True)
122 self._copy(outputFileName, "CDS")
123 if self.verbosity > 0:
124 print "%d overlaps in CDS" % (nbOverlaps)
125 return outputNoFileName
126
127 def _getFivePrime(self, inputFileName):
128 outputFileName = "%sin_five.gff3" % (self.outputBase)
129 outputNoFileName = "%sin_nofive.gff3" % (self.outputBase)
130 self.tmpFileNames.extend([outputFileName, outputNoFileName])
131 nbOverlaps = self._compare(inputFileName, "gff3", self.referenceFiles["five_prime_UTR"], "gff3", outputFileName)
132 self._compare(inputFileName, "gff3", self.referenceFiles["five_prime_UTR"], "gff3", outputNoFileName, True)
133 self._copy(outputFileName, "five_prime_UTR")
134 if self.verbosity > 0:
135 print "%d overlaps in 5' UTR" % (nbOverlaps)
136 return outputNoFileName
137
138 def _getThreePrime(self, inputFileName):
139 outputFileName = "%sin_three.gff3" % (self.outputBase)
140 outputNoFileName = "%sin_nothree.gff3" % (self.outputBase)
141 self.tmpFileNames.extend([outputFileName, outputNoFileName])
142 nbOverlaps = self._compare(inputFileName, "gff3", self.referenceFiles["three_prime_UTR"], "gff3", outputFileName)
143 self._compare(inputFileName, "gff3", self.referenceFiles["three_prime_UTR"], "gff3", outputNoFileName, True)
144 self._copy(outputFileName, "three_prime_UTR")
145 if self.verbosity > 0:
146 print "%d overlaps in 3' UTR" % (nbOverlaps)
147 return outputNoFileName
148
149 def _getNcRna(self, inputFileName):
150 outputFileName = "%sin_ncRna.gff3" % (self.outputBase)
151 outputNoFileName = "%sin_noNcRna.gff3" % (self.outputBase)
152 self.tmpFileNames.extend([outputFileName, outputNoFileName])
153 nbOverlaps = self._compare(inputFileName, "gff3", self.referenceFiles["ncRNA"], "gff3", outputFileName)
154 self._compare(inputFileName, "gff3", self.referenceFiles["ncRNA"], "gff3", outputNoFileName, True)
155 self._copy(outputFileName, "ncRNA")
156 if self.verbosity > 0:
157 print "%d overlaps in ncRNA" % (nbOverlaps)
158 return outputNoFileName
159
160 def _getTe(self, inputFileName):
161 outputFileName = "%sin_te.gff3" % (self.outputBase)
162 outputNoFileName = "%sin_noTe.gff3" % (self.outputBase)
163 self.tmpFileNames.extend([outputFileName, outputNoFileName])
164 nbOverlaps = self._compare(inputFileName, "gff3", self.referenceFiles["transposable_element_gene"], "gff3", outputFileName)
165 self._compare(inputFileName, "gff3", self.referenceFiles["transposable_element_gene"], "gff3", outputNoFileName, True)
166 self._copy(outputFileName, "TE")
167 if self.verbosity > 0:
168 print "%d overlaps in TE" % (nbOverlaps)
169 return outputNoFileName
170
171 def _getIntron(self, inputFileName):
172 outputFileName = "%sin_intron.gff3" % (self.outputBase)
173 outputNoFileName = "%sin_nointron.gff3" % (self.outputBase)
174 self.tmpFileNames.extend([outputFileName, outputNoFileName])
175 nbOverlaps = self._compare(inputFileName, "gff3", self.referenceFiles["mRNA"], "gff3", outputFileName)
176 self._compare(inputFileName, "gff3", self.referenceFiles["mRNA"], "gff3", outputNoFileName, True)
177 self._copy(outputFileName, "intron")
178 if self.verbosity > 0:
179 print "%d overlaps in introns" % (nbOverlaps)
180 return outputNoFileName
181
182 def _getVicinity(self, inputFileName):
183 guds = GetUpDownStream(self.verbosity-1)
184 guds.setInputFile(self.referenceFiles["mRNA"], "gff3")
185 guds.setOutputFile(self.referenceFiles["vic"])
186 guds.setDistances(self.distance, self.distance)
187 guds.run()
188 outputFileName = "%sout_vicinity.gff3" % (self.outputBase)
189 outputNoFileName = "%sout_novicinity.gff3" % (self.outputBase)
190 self.tmpFileNames.extend([outputFileName, outputNoFileName])
191 nbOverlaps = self._compare(inputFileName, "gff3", self.referenceFiles["vic"], "gff3", outputFileName)
192 nbNoOverlaps = self._compare(inputFileName, "gff3", self.referenceFiles["vic"], "gff3", outputNoFileName, True)
193 self._copy(outputFileName, "vicinity")
194 self._copy(outputNoFileName, "intergenic")
195 if self.verbosity > 0:
196 print "%d overlaps in vicinity" % (nbOverlaps)
197 print "%d elsewhere" % (nbNoOverlaps)
198
199 def run(self):
200 self._getReferenceFiles()
201 outputFileName = self._getCds()
202 outputFileName = self._getFivePrime(outputFileName)
203 outputFileName = self._getThreePrime(outputFileName)
204 outputFileName = self._getNcRna(outputFileName)
205 outputFileName = self._getTe(outputFileName)
206 outputFileName = self._getIntron(outputFileName)
207 self._getVicinity(outputFileName)
208
209
210
211 if __name__ == "__main__":
212
213 # parse command line
214 description = "Count Loci v1.0.0: Count input elements with respect to CDS, 5' UTR, 3' UTR, intron, downstream, upstream. [Category: Personal]"
215
216 parser = OptionParser(description = description)
217 parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
218 parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input [compulsory] [format: transcript file format]")
219 parser.add_option("-r", "--reference", dest="reference", action="store", type="string", help="reference file [compulsory] [format: file in GFF format]")
220 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]")
221 parser.add_option("-d", "--distance", dest="distance", action="store", type="int", help="distance up/down stream [compulsory] [format: output file in GFF3 format]")
222 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
223 (options, args) = parser.parse_args()
224
225 cl = CountLoci(options.verbosity)
226 cl.setInputFile(options.inputFileName, options.format)
227 cl.setDistance(options.distance)
228 cl.setReference(options.reference)
229 cl.setOutputFileName(options.outputFileName)
230 cl.run()