Mercurial > repos > fcaramia > contra
diff Contra/scripts/average_count.py @ 0:7564f3b1e675
Uploaded
author | fcaramia |
---|---|
date | Thu, 13 Sep 2012 02:31:43 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Contra/scripts/average_count.py Thu Sep 13 02:31:43 2012 -0400 @@ -0,0 +1,195 @@ +# ----------------------------------------------------------------------# +# Copyright (c) 2011, Richard Lupat & Jason Li. +# +# > Source License < +# This file is part of CONTRA. +# +# CONTRA is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# CONTRA is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with CONTRA. If not, see <http://www.gnu.org/licenses/>. +# +# +#-----------------------------------------------------------------------# +# Last Updated : 28 Sep 2011 11:00AM + +import sys +import math + +def getAverage(list1): + if len(list1) > 0: + return float(sum(list1))/len(list1) + + return 0.0 + +def getStdDev(list1, avg): + var = 0.0 + for x in list1: + var += (avg - x) ** 2 + + if (len(list1)-1) > 0: + var /= (len(list1)-1) + + return math.sqrt(var) + +def getMinMax(list1): + length = len(list1) + if length != 0: + min = list1[0] + max = list1[length-1] + else: + min = 0 + max = 0 + + return min, max + +def getMedian(list1): + length = len(list1) + if length == 0: + median = 0 + elif length % 2 == 0: + median = (list1[length/2]+list1[(length/2) - 1])/2 + else: + median = list1[length/2] + return median + +def createDataDict(count, list1, r, offset, id_check, exon_check): + tDict = {} + tDictOri = {} + + while count < len(list1): + t = list1[count].split() + tId = t[5] + tExon = t[6] + + if (tId != id_check) or (tExon != exon_check): + return count, tDict, tDictOri + + tStart = int(t[2]) + tEnd = int(t[3]) + tCov = float(t[4]) / r + offset #GeoMean Normalisation + tCovOri = float(t[4]) + offset #without scaling + + #filling dict + while tStart < tEnd: + tDict[tStart] = tCov + tDictOri[tStart] = tCovOri #without scaling + tStart += 1 + + count += 1 + + return count, tDict, tDictOri + +def getFactor (val1, val2): + r = math.sqrt(val1 * val2) + r1 = val1/r + r2 = val2/r + return r1, r2 + +def averageCount(tFile, nFile, averageOut, tReadCount, nReadCount, rd_threshold, minNBases): + tList = file.readlines(open(tFile)) + nList = file.readlines(open(nFile)) + # constant & counter + OFF = 1 + tCount = 0 + nCount = 0 + + # create and open files + output = open(averageOut, "w") + + # Offset and Ratio for Geometric Mean Normalisation + r1, r2 = getFactor(tReadCount, nReadCount) + if rd_threshold > 0: + #OFF = 0 + OFF = 0.5 + + #big loop + while (nCount < len(nList)): + # initialisation, get the chr, geneID, geneName + init = tList[tCount].split() + initial = init[5] + _exon = init[6] + chr = init[1] + gene = init[0] + _start = int(init[2]) + + # check if t-gene and n-gene refer to the same gene + check_init = nList[nCount].split() + if check_init[5] != initial or check_init[6] != _exon: + print "Initial: ", initial + print "Check_Init.id: ", check_init[5] + print "_Exon: ", _exon + print "Check_Init.exon: ", check_init[6] + print "Error. Comparing different Gene" + sys.exit(1) + + # create data dictionary for tumour and normal data (per each regions/ exon) + tCount, tDict, tDictOri = createDataDict(tCount, tList, r1, OFF, initial, _exon) + nCount, nDict, nDictOri = createDataDict(nCount, nList, r2, OFF, initial, _exon) + # check number of bases in the both gene dict + if len(nDict) != len(tDict): + print "N:", len(nDict) + print "T:", len(tDict) + print "Error. Different length of dict" + sys.exit(1) + + # compare coverage + count = _start + _max = max(nDict.keys()) + ratioList = [] + tumourList = [] + normalList = [] + tumourOriList = [] + normalOriList = [] + while count <= _max: + # get ratio + if (nDict[count] < rd_threshold) and (tDict[count] < rd_threshold): + ratio = 0.0 + else: + if tDict[count] == 0: + tDict[count] = 0.5 + + ratio = math.log((float(tDict[count]) / nDict[count]),2) + tumourList.append(tDict[count]) + tumourOriList.append(tDictOri[count]) + normalList.append(nDict[count]) + normalOriList.append(nDictOri[count]) + ratioList.append(ratio) + + count += 1 + + ratioLen = len(ratioList) + + # get average + avg = getAverage(ratioList) + sd = getStdDev(ratioList, avg) + tumourAvg= str(round(getAverage(tumourList),3)) + normalAvg= str(round(getAverage(normalList),3)) + tumourOriAvg = str(round(getAverage(tumourOriList),3)) + normalOriAvg = str(round(getAverage(normalOriList),3)) + + # get median + ratioList.sort() + min_logratio, max_logratio = getMinMax(ratioList) + median = getMedian(ratioList) + + # write output + if ratioLen >= minNBases: + output.write(initial + "\t" + gene + "\t" + str(ratioLen) + "\t") + output.write(str(round(avg,3))+ "\t"+ str(count)+ "\t" + _exon + "\t") + output.write(str(round(sd ,3))+ "\t"+ tumourAvg + "\t" + normalAvg +"\t") + output.write(tumourOriAvg + "\t" + normalOriAvg + "\t") + output.write(str(round(median,3)) + "\t" + str(round(min_logratio,3)) + "\t") + output.write(str(round(max_logratio,3)) + "\n") + + output.close() + + #print "End of averageCount.py with the last target = '%s'" %(initial)