Mercurial > repos > fcaramia > contra
view Contra/scripts/average_count.py @ 0:7564f3b1e675
Uploaded
author | fcaramia |
---|---|
date | Thu, 13 Sep 2012 02:31:43 -0400 |
parents | |
children |
line wrap: on
line source
# ----------------------------------------------------------------------# # Copyright (c) 2011, Richard Lupat & Jason Li. # # > Source License < # This file is part of CONTRA. # # CONTRA is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # CONTRA is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with CONTRA. If not, see <http://www.gnu.org/licenses/>. # # #-----------------------------------------------------------------------# # Last Updated : 28 Sep 2011 11:00AM import sys import math def getAverage(list1): if len(list1) > 0: return float(sum(list1))/len(list1) return 0.0 def getStdDev(list1, avg): var = 0.0 for x in list1: var += (avg - x) ** 2 if (len(list1)-1) > 0: var /= (len(list1)-1) return math.sqrt(var) def getMinMax(list1): length = len(list1) if length != 0: min = list1[0] max = list1[length-1] else: min = 0 max = 0 return min, max def getMedian(list1): length = len(list1) if length == 0: median = 0 elif length % 2 == 0: median = (list1[length/2]+list1[(length/2) - 1])/2 else: median = list1[length/2] return median def createDataDict(count, list1, r, offset, id_check, exon_check): tDict = {} tDictOri = {} while count < len(list1): t = list1[count].split() tId = t[5] tExon = t[6] if (tId != id_check) or (tExon != exon_check): return count, tDict, tDictOri tStart = int(t[2]) tEnd = int(t[3]) tCov = float(t[4]) / r + offset #GeoMean Normalisation tCovOri = float(t[4]) + offset #without scaling #filling dict while tStart < tEnd: tDict[tStart] = tCov tDictOri[tStart] = tCovOri #without scaling tStart += 1 count += 1 return count, tDict, tDictOri def getFactor (val1, val2): r = math.sqrt(val1 * val2) r1 = val1/r r2 = val2/r return r1, r2 def averageCount(tFile, nFile, averageOut, tReadCount, nReadCount, rd_threshold, minNBases): tList = file.readlines(open(tFile)) nList = file.readlines(open(nFile)) # constant & counter OFF = 1 tCount = 0 nCount = 0 # create and open files output = open(averageOut, "w") # Offset and Ratio for Geometric Mean Normalisation r1, r2 = getFactor(tReadCount, nReadCount) if rd_threshold > 0: #OFF = 0 OFF = 0.5 #big loop while (nCount < len(nList)): # initialisation, get the chr, geneID, geneName init = tList[tCount].split() initial = init[5] _exon = init[6] chr = init[1] gene = init[0] _start = int(init[2]) # check if t-gene and n-gene refer to the same gene check_init = nList[nCount].split() if check_init[5] != initial or check_init[6] != _exon: print "Initial: ", initial print "Check_Init.id: ", check_init[5] print "_Exon: ", _exon print "Check_Init.exon: ", check_init[6] print "Error. Comparing different Gene" sys.exit(1) # create data dictionary for tumour and normal data (per each regions/ exon) tCount, tDict, tDictOri = createDataDict(tCount, tList, r1, OFF, initial, _exon) nCount, nDict, nDictOri = createDataDict(nCount, nList, r2, OFF, initial, _exon) # check number of bases in the both gene dict if len(nDict) != len(tDict): print "N:", len(nDict) print "T:", len(tDict) print "Error. Different length of dict" sys.exit(1) # compare coverage count = _start _max = max(nDict.keys()) ratioList = [] tumourList = [] normalList = [] tumourOriList = [] normalOriList = [] while count <= _max: # get ratio if (nDict[count] < rd_threshold) and (tDict[count] < rd_threshold): ratio = 0.0 else: if tDict[count] == 0: tDict[count] = 0.5 ratio = math.log((float(tDict[count]) / nDict[count]),2) tumourList.append(tDict[count]) tumourOriList.append(tDictOri[count]) normalList.append(nDict[count]) normalOriList.append(nDictOri[count]) ratioList.append(ratio) count += 1 ratioLen = len(ratioList) # get average avg = getAverage(ratioList) sd = getStdDev(ratioList, avg) tumourAvg= str(round(getAverage(tumourList),3)) normalAvg= str(round(getAverage(normalList),3)) tumourOriAvg = str(round(getAverage(tumourOriList),3)) normalOriAvg = str(round(getAverage(normalOriList),3)) # get median ratioList.sort() min_logratio, max_logratio = getMinMax(ratioList) median = getMedian(ratioList) # write output if ratioLen >= minNBases: output.write(initial + "\t" + gene + "\t" + str(ratioLen) + "\t") output.write(str(round(avg,3))+ "\t"+ str(count)+ "\t" + _exon + "\t") output.write(str(round(sd ,3))+ "\t"+ tumourAvg + "\t" + normalAvg +"\t") output.write(tumourOriAvg + "\t" + normalOriAvg + "\t") output.write(str(round(median,3)) + "\t" + str(round(min_logratio,3)) + "\t") output.write(str(round(max_logratio,3)) + "\n") output.close() #print "End of averageCount.py with the last target = '%s'" %(initial)