6
+ − 1 #! /usr/bin/env python
+ − 2 #
+ − 3 # Copyright INRA-URGI 2009-2010
+ − 4 #
+ − 5 # This software is governed by the CeCILL license under French law and
+ − 6 # abiding by the rules of distribution of free software. You can use,
+ − 7 # modify and/ or redistribute the software under the terms of the CeCILL
+ − 8 # license as circulated by CEA, CNRS and INRIA at the following URL
+ − 9 # "http://www.cecill.info".
+ − 10 #
+ − 11 # As a counterpart to the access to the source code and rights to copy,
+ − 12 # modify and redistribute granted by the license, users are provided only
+ − 13 # with a limited warranty and the software's author, the holder of the
+ − 14 # economic rights, and the successive licensors have only limited
+ − 15 # liability.
+ − 16 #
+ − 17 # In this respect, the user's attention is drawn to the risks associated
+ − 18 # with loading, using, modifying and/or developing or reproducing the
+ − 19 # software by the user in light of its specific status of free software,
+ − 20 # that may mean that it is complicated to manipulate, and that also
+ − 21 # therefore means that it is reserved for developers and experienced
+ − 22 # professionals having in-depth computer knowledge. Users are therefore
+ − 23 # encouraged to load and test the software's suitability as regards their
+ − 24 # requirements in conditions enabling the security of their systems and/or
+ − 25 # data to be ensured and, more generally, to use and operate it in the
+ − 26 # same conditions as regards security.
+ − 27 #
+ − 28 # The fact that you are presently reading this means that you have had
+ − 29 # knowledge of the CeCILL license and that you accept its terms.
+ − 30 #
+ − 31 """Get the differential expression between 2 conditions (2 files), on regions defined by a third file"""
+ − 32
+ − 33 import os, re
+ − 34 from optparse import OptionParser
+ − 35 from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
+ − 36 from commons.core.writer.Gff3Writer import Gff3Writer
+ − 37 from SMART.Java.Python.misc.Progress import Progress
+ − 38 from SMART.Java.Python.misc.RPlotter import RPlotter
+ − 39 from SMART.Java.Python.misc import Utils
+ − 40 from SMART.Java.Python.mySql.MySqlConnection import MySqlConnection
+ − 41 from SMART.Java.Python.structure.Transcript import Transcript
+ − 42
+ − 43 class GetDifferentialExpression(object):
+ − 44
+ − 45 def __init__(self, verbosity = 1):
+ − 46 self.verbosity = verbosity
+ − 47 self.mySqlConnection = MySqlConnection(verbosity)
+ − 48 self.inputs = (0, 1)
+ − 49 self.transcriptContainers = [None, None]
+ − 50 self.transcriptContainerRef = None
+ − 51 self.outputFileName = None
+ − 52 self.writer = None
+ − 53 self.tables = [None, None]
+ − 54 self.nbElements = [0, 0]
+ − 55
+ − 56 self.regionsToValues = {}
+ − 57 self.regionsToNames = {}
+ − 58 self.valuesToPvalues = {}
+ − 59
+ − 60 self.oriented = True
+ − 61 self.simpleNormalization = False
+ − 62 self.simpleNormalizationParameters = None
+ − 63 self.adjustedNormalization = False
+ − 64 self.fixedSizeFactor = None
+ − 65 self.normalizationSize = None
+ − 66 self.normalizationFactors = [1, 1]
+ − 67 self.fdr = None
+ − 68 self.fdrPvalue = None
+ − 69
+ − 70 self.plot = False
+ − 71 self.plotter = None
+ − 72 self.plotterName = None
+ − 73 self.points = {}
+ − 74
+ − 75
+ − 76 def setInputFile(self, i, fileName, fileFormat):
+ − 77 self.transcriptContainers[i] = TranscriptContainer(fileName, fileFormat, self.verbosity)
+ − 78 self.transcriptContainers[i].mySqlConnection = self.mySqlConnection
+ − 79
+ − 80
+ − 81 def setReferenceFile(self, fileName, fileFormat):
+ − 82 self.transcriptContainerRef = TranscriptContainer(fileName, fileFormat, self.verbosity)
+ − 83 self.transcriptContainerRef.mySqlConnection = self.mySqlConnection
+ − 84
+ − 85
+ − 86 def setOutputFile(self, fileName):
+ − 87 self.outputFileName = fileName
+ − 88 self.writer = Gff3Writer(fileName, self.verbosity)
+ − 89
+ − 90
+ − 91 def setOriented(self, boolean):
+ − 92 self.oriented = boolean
+ − 93
+ − 94
+ − 95 def setSimpleNormalization(self, boolean):
+ − 96 self.simpleNormalization = boolean
+ − 97
+ − 98
+ − 99 def setSimpleNormalizationParameters(self, parameters):
+ − 100 if parameters != None:
+ − 101 self.simpleNormalization = True
+ − 102 self.simpleNormalizationParameters = [0, 0]
+ − 103 for i, splittedParameter in enumerate(parameters.split(",")):
+ − 104 self.simpleNormalizationParameters[i] = int(splittedParameter)
+ − 105
+ − 106
+ − 107 def setAdjustedNormalization(self, boolean):
+ − 108 self.adjustedNormalization = boolean
+ − 109
+ − 110
+ − 111 def setFixedSizeNormalization(self, value):
+ − 112 self.fixedSizeFactor = value
+ − 113
+ − 114
+ − 115 def setFdr(self, fdr):
+ − 116 self.fdr = fdr
+ − 117
+ − 118
+ − 119 def setPlot(self, boolean):
+ − 120 self.plot = boolean
+ − 121
+ − 122
+ − 123 def setPlotterName(self, plotterName):
+ − 124 self.plotterName = plotterName
+ − 125
+ − 126 def setPlotter(self):
+ − 127 self.plot = True
+ − 128 self.plotter = RPlotter(self.plotterName, self.verbosity)
+ − 129 self.plotter.setPoints(True)
+ − 130 self.plotter.setLog("xy")
+ − 131 self.points = {}
+ − 132
+ − 133
+ − 134 def readInput(self, i):
+ − 135 self.transcriptContainers[i].storeIntoDatabase()
+ − 136 self.tables[i] = self.transcriptContainers[i].getTables()
+ − 137 progress = Progress(len(self.tables[i].keys()), "Adding indices", self.verbosity)
+ − 138 for chromosome in self.tables[i]:
+ − 139 if self.oriented:
+ − 140 self.tables[i][chromosome].createIndex("iStartEndDir_%s_%d" % (chromosome, i), ("start", "end", "direction"))
+ − 141 else:
+ − 142 self.tables[i][chromosome].createIndex("iStartEnd_%s_%d" % (chromosome, i), ("start", "end"))
+ − 143 progress.inc()
+ − 144 progress.done()
+ − 145
+ − 146 progress = Progress(self.transcriptContainers[i].getNbTranscripts(), "Reading sample %d" % (i +1), self.verbosity)
+ − 147 for chromosome in self.tables[i]:
+ − 148 for transcript in self.tables[i][chromosome].getIterator():
+ − 149 self.nbElements[i] += 1 if "nbElements" not in transcript.getTagNames() else transcript.getTagValue("nbElements")
+ − 150 progress.inc()
+ − 151 progress.done()
+ − 152 if self.verbosity > 0:
+ − 153 print "%d elements in sample %d" % (self.nbElements[i], i+1)
+ − 154
+ − 155
+ − 156 def computeSimpleNormalizationFactors(self):
+ − 157 nbElements = self.nbElements
+ − 158 if self.simpleNormalizationParameters != None:
+ − 159 print "Using provided normalization parameters: %s" % (", ".join([str(parameter) for parameter in self.simpleNormalizationParameters]))
+ − 160 nbElements = self.simpleNormalizationParameters
+ − 161 avgNbElements = int(float(sum(nbElements)) / len(nbElements))
+ − 162 for i in self.inputs:
+ − 163 self.normalizationFactors[i] = float(avgNbElements) / nbElements[i]
+ − 164 self.nbElements[i] *= self.normalizationFactors[i]
+ − 165 if self.verbosity > 1:
+ − 166 print "Normalizing to average # reads: %d" % (avgNbElements)
+ − 167 if self.simpleNormalizationParameters != None:
+ − 168 print "# reads: %s" % (", ".join([str(nbElement) for nbElement in self.nbElements]))
+ − 169
+ − 170 def __del__(self):
+ − 171 self.mySqlConnection.deleteDatabase()
+ − 172
+ − 173 def regionToString(self, transcript):
+ − 174 return "%s:%d-%d(%s)" % (transcript.getChromosome(), transcript.getStart(), transcript.getEnd(), "+" if transcript.getDirection() == 1 else "-")
+ − 175
+ − 176 def stringToRegion(self, region):
+ − 177 m = re.search(r"^(\S+):(\d+)-(\d+)\((\S)\)$", region)
+ − 178 if m == None:
+ − 179 raise Exception("Internal format error: cannot parse region '%s'" % (region))
+ − 180 transcript = Transcript()
+ − 181 transcript.setChromosome(m.group(1))
+ − 182 transcript.setStart(int(m.group(2)))
+ − 183 transcript.setEnd(int(m.group(3)))
+ − 184 transcript.setDirection(m.group(4))
+ − 185 return transcript
+ − 186
+ − 187 def computeMinimumSize(self):
+ − 188 self.normalizationSize = 1000000000
+ − 189 progress = Progress(self.transcriptContainerRef.getNbTranscripts(), "Getting minimum reference size", self.verbosity)
+ − 190 for transcriptRef in self.transcriptContainerRef.getIterator():
+ − 191 self.normalizationSize = min(self.normalizationSize, transcriptRef.getEnd() - transcriptRef.getStart())
+ − 192 progress.inc()
+ − 193 progress.done()
+ − 194 if self.verbosity > 1:
+ − 195 print "Minimum reference size: %d" % (self.normalizationSize+1)
+ − 196
+ − 197 def useFixedSizeNormalization(self, start, end, starts):
+ − 198 currentNb = 0
+ − 199 sum = 0
+ − 200 if not starts:
+ − 201 return 0
+ − 202 for i in range(start - self.normalizationSize, end + 1 + self.normalizationSize):
+ − 203 if i not in starts:
+ − 204 starts[i] = 0
+ − 205 for i, s in starts.iteritems():
+ − 206 if i < start:
+ − 207 starts[start] += s
+ − 208 starts[i] = 0
+ − 209 for i in range(start - self.normalizationSize, end + 1):
+ − 210 currentNb += starts[i+self.normalizationSize] - starts[i]
+ − 211 sum += currentNb
+ − 212 return (float(sum) / self.normalizationSize) * (self.fixedSizeFactor / (end - start + 1))
+ − 213
+ − 214 def retrieveCounts(self, transcriptRef, i):
+ − 215 if transcriptRef.getChromosome() not in self.tables[i]:
+ − 216 return (0, 0)
+ − 217 cumulatedCount = 0
+ − 218 cumulatedNormalizedCount = 0
+ − 219 for exon in transcriptRef.getExons():
+ − 220 count = 0
+ − 221 starts = {}
+ − 222 command = "SELECT start, tags FROM '%s' WHERE start >= %d AND end <= %d" % (self.tables[i][exon.getChromosome()].getName(), exon.getStart(), exon.getEnd())
+ − 223 if self.oriented:
+ − 224 command += " AND direction = %d" % (exon.getDirection())
+ − 225 query = self.mySqlConnection.executeQuery(command)
+ − 226 for line in query.getIterator():
+ − 227 nb = 1
+ − 228 tags = line[1].split(";")
+ − 229 for tag in tags:
+ − 230 key, value = tag.split("=")
+ − 231 if key == "nbElements":
+ − 232 nb = int(float(value))
+ − 233 count += nb
+ − 234 starts[int(line[0])] = nb
+ − 235 normalizedCount = count if self.fixedSizeFactor == None else self.useFixedSizeNormalization(exon.getStart(), exon.getEnd(), starts)
+ − 236 cumulatedCount += count
+ − 237 cumulatedNormalizedCount += normalizedCount
+ − 238 return (cumulatedCount, cumulatedNormalizedCount)
+ − 239
+ − 240 def getAllCounts(self):
+ − 241 progress = Progress(self.transcriptContainerRef.getNbTranscripts(), "Getting counts", self.verbosity)
+ − 242 for cpt, transcriptRef in enumerate(self.transcriptContainerRef.getIterator()):
+ − 243 if "ID" in transcriptRef.getTagNames():
+ − 244 self.regionsToNames[self.regionToString(transcriptRef)] = transcriptRef.getTagValue("ID")
+ − 245 elif transcriptRef.getName() != None:
+ − 246 self.regionsToNames[self.regionToString(transcriptRef)] = transcriptRef.getName()
+ − 247 else:
+ − 248 self.regionsToNames[self.regionToString(transcriptRef)] = "region_%d" % (cpt)
+ − 249 values = [None, None]
+ − 250 normalizedValues = [None, None]
+ − 251 for i in self.inputs:
+ − 252 values[i], normalizedValues[i] = self.retrieveCounts(transcriptRef, i)
+ − 253 normalizedValues[i] = int(self.normalizationFactors[i] * normalizedValues[i])
+ − 254 if sum(values) != 0:
+ − 255 self.regionsToValues[self.regionToString(transcriptRef)] = (normalizedValues[0], normalizedValues[1], values[0], values[1])
+ − 256 progress.inc()
+ − 257 progress.done()
+ − 258
+ − 259 def computeAdjustedNormalizationFactors(self):
+ − 260 nbElements = len(self.regionsToValues.keys())
+ − 261 avgValues = []
+ − 262 progress = Progress(nbElements, "Normalization step 1", self.verbosity)
+ − 263 for values in self.regionsToValues.values():
+ − 264 correctedValues = [values[i] * self.normalizationFactors[i] for i in self.inputs]
+ − 265 avgValues.append(float(sum(correctedValues)) / len(correctedValues))
+ − 266 progress.inc()
+ − 267 progress.done()
+ − 268
+ − 269 sortedAvgValues = sorted(avgValues)
+ − 270 minAvgValues = sortedAvgValues[nbElements / 4]
+ − 271 maxAvgValues = sortedAvgValues[nbElements * 3 / 4]
+ − 272 sums = [0, 0]
+ − 273 progress = Progress(nbElements, "Normalization step 2", self.verbosity)
+ − 274 for values in self.regionsToValues.values():
+ − 275 correctedValues = [values[i] * self.normalizationFactors[i] for i in self.inputs]
+ − 276 avgValue = float(sum(correctedValues)) / len(correctedValues)
+ − 277 if minAvgValues <= avgValue and avgValue <= maxAvgValues:
+ − 278 for i in self.inputs:
+ − 279 sums[i] += values[i]
+ − 280 progress.inc()
+ − 281 progress.done()
+ − 282
+ − 283 avgSums = float(sum(sums)) / len(sums)
+ − 284 for i in self.inputs:
+ − 285 if self.verbosity > 1:
+ − 286 print "Normalizing sample %d: %s to" % ((i+1), self.nbElements[i]),
+ − 287 self.normalizationFactors[i] *= float(avgSums) / sums[i]
+ − 288 self.nbElements[i] *= self.normalizationFactors[i]
+ − 289 if self.verbosity > 1:
+ − 290 print "%s" % (int(self.nbElements[i]))
+ − 291
+ − 292 def getMinimumReferenceSize(self):
+ − 293 self.normalizationSize = 1000000000
+ − 294 progress = Progress(self.transcriptContainerRef.getNbTranscripts(), "Reference element sizes", self.verbosity)
+ − 295 for transcriptRef in self.transcriptContainerRef.getIterator():
+ − 296 self.normalizationSize = min(self.normalizationSize, transcriptRef.getEnd() - transcriptRef.getStart() + 1)
+ − 297 progress.inc()
+ − 298 progress.done()
+ − 299 if self.verbosity > 1:
+ − 300 print "Minimum reference size: %d" % (self.normalizationSize)
+ − 301
+ − 302 def computePvalues(self):
+ − 303 normalizedValues = set()
+ − 304 progress = Progress(len(self.regionsToValues.keys()), "Normalizing counts", self.verbosity)
+ − 305 for region in self.regionsToValues:
+ − 306 values = self.regionsToValues[region]
+ − 307 normalizedValues0 = int(round(values[0] * self.normalizationFactors[0]))
+ − 308 normalizedValues1 = int(round(values[1] * self.normalizationFactors[1]))
+ − 309 self.regionsToValues[region] = (normalizedValues0, normalizedValues1, self.regionsToValues[region][2], self.regionsToValues[region][3])
+ − 310 normalizedValues.add((normalizedValues0, normalizedValues1, self.nbElements[0] - normalizedValues0, self.nbElements[1] - normalizedValues1, self.regionsToValues[region][2], self.regionsToValues[region][3]))
+ − 311 progress.inc()
+ − 312 progress.done()
+ − 313
+ − 314 if self.verbosity > 1:
+ − 315 print "Computing p-values..."
+ − 316 self.valuesToPvalues = Utils.fisherExactPValueBulk(list(normalizedValues))
+ − 317 if self.verbosity > 1:
+ − 318 print "... done"
+ − 319
+ − 320 def setTagValues(self, transcript, values, pValue):
+ − 321 for tag in transcript.getTagNames():
+ − 322 transcript.deleteTag(tag)
+ − 323 transcript.removeExons()
+ − 324 transcript.setTagValue("pValue", str(pValue))
+ − 325 transcript.setTagValue("nbReadsCond1", str(values[0]))
+ − 326 transcript.setTagValue("nbReadsCond2", str(values[1]))
+ − 327 transcript.setTagValue("nbUnnormalizedReadsCond1", str(values[2]))
+ − 328 transcript.setTagValue("nbUnnormalizedReadsCond2", str(values[3]))
+ − 329 if (values[0] == values[1]) or (self.fdr != None and pValue > self.fdrPvalue):
+ − 330 transcript.setTagValue("regulation", "equal")
+ − 331 elif values[0] < values[1]:
+ − 332 transcript.setTagValue("regulation", "up")
+ − 333 else:
+ − 334 transcript.setTagValue("regulation", "down")
+ − 335 return transcript
+ − 336
+ − 337 def computeFdr(self):
+ − 338 pValues = []
+ − 339 nbRegions = len(self.regionsToValues.keys())
+ − 340 progress = Progress(nbRegions, "Computing FDR", self.verbosity)
+ − 341 for values in self.regionsToValues.values():
+ − 342 pValues.append(self.valuesToPvalues[values[0:2]])
+ − 343 progress.inc()
+ − 344 progress.done()
+ − 345
+ − 346 for i, pValue in enumerate(reversed(sorted(pValues))):
+ − 347 if pValue <= self.fdr * (nbRegions - 1 - i) / nbRegions:
+ − 348 self.fdrPvalue = pValue
+ − 349 if self.verbosity > 1:
+ − 350 print "FDR: %f, k: %i, m: %d" % (pValue, nbRegions - 1 - i, nbRegions)
+ − 351 return
+ − 352
+ − 353 def writeDifferentialExpression(self):
+ − 354 if self.plot:
+ − 355 self.setPlotter()
+ − 356
+ − 357 cpt = 1
+ − 358 progress = Progress(len(self.regionsToValues.keys()), "Writing output", self.verbosity)
+ − 359 for region, values in self.regionsToValues.iteritems():
+ − 360 transcript = self.stringToRegion(region)
+ − 361 pValue = self.valuesToPvalues[values[0:2]]
+ − 362 transcript.setName(self.regionsToNames[region])
+ − 363 transcript = self.setTagValues(transcript, values, pValue)
+ − 364 self.writer.addTranscript(transcript)
+ − 365 cpt += 1
+ − 366
+ − 367 if self.plot:
+ − 368 self.points[region] = (values[0], values[1])
+ − 369 progress.done()
+ − 370 self.writer.write()
+ − 371 self.writer.close()
+ − 372
+ − 373 if self.plot:
+ − 374 self.plotter.addLine(self.points)
+ − 375 self.plotter.plot()
+ − 376
+ − 377 def getDifferentialExpression(self):
+ − 378 for i in self.inputs:
+ − 379 self.readInput(i)
+ − 380
+ − 381 if self.simpleNormalization:
+ − 382 self.computeSimpleNormalizationFactors()
+ − 383 if self.fixedSizeFactor != None:
+ − 384 self.computeMinimumSize()
+ − 385
+ − 386 self.getAllCounts()
+ − 387
+ − 388 if self.adjustedNormalization:
+ − 389 self.computeAdjustedNormalizationFactors()
+ − 390
+ − 391 self.computePvalues()
+ − 392
+ − 393 if self.fdr != None:
+ − 394 self.computeFdr()
+ − 395
+ − 396 self.writeDifferentialExpression()
+ − 397
+ − 398
+ − 399 if __name__ == "__main__":
+ − 400
+ − 401 # parse command line
+ − 402 description = "Get Differential Expression v1.0.1: Get the differential expression between 2 conditions using Fisher's exact test, on regions defined by a third file. [Category: Data Comparison]"
+ − 403
+ − 404 parser = OptionParser(description = description)
+ − 405 parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file 1 [compulsory] [format: file in transcript format given by -f]")
+ − 406 parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of file 1 [compulsory] [format: transcript file format]")
+ − 407 parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]")
+ − 408 parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of file 2 [compulsory] [format: transcript file format]")
+ − 409 parser.add_option("-k", "--reference", dest="referenceFileName", action="store", type="string", help="reference file [compulsory] [format: file in transcript format given by -l]")
+ − 410 parser.add_option("-l", "--referenceFormat", dest="referenceFormat", action="store", type="string", help="format of reference file [compulsory] [format: transcript file format]")
+ − 411 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in gff3 format]")
+ − 412 parser.add_option("-n", "--notOriented", dest="notOriented", action="store_true", default=False, help="if the reads are not oriented [default: False] [format: bool]")
+ − 413 parser.add_option("-s", "--simple", dest="simple", action="store_true", default=False, help="normalize using the number of reads in each condition [format: bool]")
+ − 414 parser.add_option("-S", "--simpleParameters", dest="simpleParameters", action="store", default=None, type="string", help="provide the number of reads [format: bool]")
+ − 415 parser.add_option("-a", "--adjusted", dest="adjusted", action="store_true", default=False, help="normalize using the number of reads of 'mean' regions [format: bool]")
+ − 416 parser.add_option("-x", "--fixedSizeFactor", dest="fixedSizeFactor", action="store", default=None, type="int", help="give the magnification factor for the normalization using fixed size sliding windows in reference regions (leave empty for no such normalization) [format: int]")
+ − 417 parser.add_option("-d", "--fdr", dest="fdr", action="store", default=None, type="float", help="use FDR [format: float]")
+ − 418 parser.add_option("-p", "--plot", dest="plotName", action="store", default=None, type="string", help="plot cloud plot [format: output file in PNG format]")
+ − 419 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
+ − 420 (options, args) = parser.parse_args()
+ − 421
+ − 422
+ − 423
+ − 424 differentialExpression = GetDifferentialExpression(options.verbosity)
+ − 425 differentialExpression.setInputFile(0, options.inputFileName1, options.format1)
+ − 426 differentialExpression.setInputFile(1, options.inputFileName2, options.format2)
+ − 427 differentialExpression.setReferenceFile(options.referenceFileName, options.referenceFormat)
+ − 428 differentialExpression.setOutputFile(options.outputFileName)
+ − 429 if options.plotName != None :
+ − 430 differentialExpression.setPlotterName(options.plotName)
+ − 431 differentialExpression.setPlotter()
+ − 432 differentialExpression.setOriented(not options.notOriented)
+ − 433 differentialExpression.setSimpleNormalization(options.simple)
+ − 434 differentialExpression.setSimpleNormalizationParameters(options.simpleParameters)
+ − 435 differentialExpression.setAdjustedNormalization(options.adjusted)
+ − 436 differentialExpression.setFixedSizeNormalization(options.fixedSizeFactor)
+ − 437 differentialExpression.setFdr(options.fdr)
+ − 438 differentialExpression.getDifferentialExpression()
+ − 439 differentialExpression.mySqlConnection.deleteDatabase()
+ − 440
+ − 441