| 
0
 | 
     1 # ----------------------------------------------------------------------#
 | 
| 
 | 
     2 # Copyright (c) 2011, Richard Lupat & Jason Li.
 | 
| 
 | 
     3 #
 | 
| 
 | 
     4 # > Source License <
 | 
| 
 | 
     5 # This file is part of CONTRA.
 | 
| 
 | 
     6 #
 | 
| 
 | 
     7 #    CONTRA is free software: you can redistribute it and/or modify
 | 
| 
 | 
     8 #    it under the terms of the GNU General Public License as published by
 | 
| 
 | 
     9 #    the Free Software Foundation, either version 3 of the License, or
 | 
| 
 | 
    10 #    (at your option) any later version.
 | 
| 
 | 
    11 #
 | 
| 
 | 
    12 #    CONTRA is distributed in the hope that it will be useful,
 | 
| 
 | 
    13 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
| 
 | 
    14 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
| 
 | 
    15 #    GNU General Public License for more details.
 | 
| 
 | 
    16 #
 | 
| 
 | 
    17 #    You should have received a copy of the GNU General Public License
 | 
| 
 | 
    18 #    along with CONTRA.  If not, see <http://www.gnu.org/licenses/>.
 | 
| 
 | 
    19 #
 | 
| 
 | 
    20 # 
 | 
| 
 | 
    21 #-----------------------------------------------------------------------#
 | 
| 
 | 
    22 # Last Updated : 28 Sep 2011 11:00AM
 | 
| 
 | 
    23 
 | 
| 
 | 
    24 import sys
 | 
| 
 | 
    25 import math
 | 
| 
 | 
    26 
 | 
| 
 | 
    27 def getAverage(list1):
 | 
| 
 | 
    28 	if len(list1) > 0:
 | 
| 
 | 
    29 		return float(sum(list1))/len(list1)
 | 
| 
 | 
    30 	
 | 
| 
 | 
    31 	return 0.0
 | 
| 
 | 
    32 
 | 
| 
 | 
    33 def getStdDev(list1, avg):
 | 
| 
 | 
    34 	var = 0.0
 | 
| 
 | 
    35 	for x in list1:
 | 
| 
 | 
    36 		var += (avg - x) ** 2
 | 
| 
 | 
    37 
 | 
| 
 | 
    38 	if (len(list1)-1) > 0:
 | 
| 
 | 
    39 		var /= (len(list1)-1)
 | 
| 
 | 
    40 
 | 
| 
 | 
    41 	return math.sqrt(var)
 | 
| 
 | 
    42 
 | 
| 
 | 
    43 def getMinMax(list1):
 | 
| 
 | 
    44 	length = len(list1)
 | 
| 
 | 
    45 	if length != 0:
 | 
| 
 | 
    46        		min = list1[0]
 | 
| 
 | 
    47         	max = list1[length-1]
 | 
| 
 | 
    48 	else:
 | 
| 
 | 
    49         	min = 0
 | 
| 
 | 
    50         	max = 0
 | 
| 
 | 
    51 
 | 
| 
 | 
    52 	return min, max
 | 
| 
 | 
    53 
 | 
| 
 | 
    54 def getMedian(list1):
 | 
| 
 | 
    55 	length = len(list1)
 | 
| 
 | 
    56 	if length == 0:
 | 
| 
 | 
    57         	median = 0
 | 
| 
 | 
    58 	elif length % 2 == 0:
 | 
| 
 | 
    59         	median = (list1[length/2]+list1[(length/2) - 1])/2
 | 
| 
 | 
    60 	else:
 | 
| 
 | 
    61         	median = list1[length/2]
 | 
| 
 | 
    62 	return median
 | 
| 
 | 
    63 
 | 
| 
 | 
    64 def createDataDict(count, list1, r, offset, id_check, exon_check):
 | 
| 
 | 
    65 	tDict 	 = {}
 | 
| 
 | 
    66 	tDictOri = {}
 | 
| 
 | 
    67 
 | 
| 
 | 
    68 	while count < len(list1):
 | 
| 
 | 
    69 		t 	= list1[count].split()
 | 
| 
 | 
    70 		tId     = t[5]
 | 
| 
 | 
    71 		tExon   = t[6]
 | 
| 
 | 
    72 
 | 
| 
 | 
    73 		if (tId != id_check) or (tExon != exon_check):
 | 
| 
 | 
    74        	 		return count, tDict, tDictOri
 | 
| 
 | 
    75 
 | 
| 
 | 
    76         	tStart  = int(t[2])
 | 
| 
 | 
    77        		tEnd    = int(t[3])
 | 
| 
 | 
    78 	        tCov    = float(t[4]) / r + offset       #GeoMean Normalisation
 | 
| 
 | 
    79         	tCovOri = float(t[4]) + offset     #without scaling
 | 
| 
 | 
    80 
 | 
| 
 | 
    81         	#filling dict
 | 
| 
 | 
    82         	while tStart < tEnd:
 | 
| 
 | 
    83                 	tDict[tStart] = tCov
 | 
| 
 | 
    84                 	tDictOri[tStart] = tCovOri #without scaling
 | 
| 
 | 
    85                 	tStart += 1
 | 
| 
 | 
    86 
 | 
| 
 | 
    87         	count += 1
 | 
| 
 | 
    88 
 | 
| 
 | 
    89 	return count, tDict, tDictOri
 | 
| 
 | 
    90 
 | 
| 
 | 
    91 def getFactor (val1, val2):
 | 
| 
 | 
    92 	r = math.sqrt(val1 * val2)
 | 
| 
 | 
    93 	r1 = val1/r
 | 
| 
 | 
    94 	r2 = val2/r
 | 
| 
 | 
    95 	return r1, r2
 | 
| 
 | 
    96 
 | 
| 
 | 
    97 def averageCount(tFile, nFile, averageOut, tReadCount, nReadCount, rd_threshold, minNBases):
 | 
| 
 | 
    98 	tList = file.readlines(open(tFile))
 | 
| 
 | 
    99 	nList = file.readlines(open(nFile))
 | 
| 
 | 
   100 	# constant & counter
 | 
| 
 | 
   101 	OFF    = 1
 | 
| 
 | 
   102 	tCount = 0
 | 
| 
 | 
   103 	nCount = 0
 | 
| 
 | 
   104 
 | 
| 
 | 
   105 	# create and open files
 | 
| 
 | 
   106 	output = open(averageOut, "w")
 | 
| 
 | 
   107 	
 | 
| 
 | 
   108 	# Offset and Ratio for Geometric Mean Normalisation
 | 
| 
 | 
   109 	r1, r2 = getFactor(tReadCount, nReadCount)	
 | 
| 
 | 
   110 	if rd_threshold > 0:
 | 
| 
 | 
   111 		#OFF = 0 
 | 
| 
 | 
   112 		OFF = 0.5
 | 
| 
 | 
   113 
 | 
| 
 | 
   114 	#big loop
 | 
| 
 | 
   115 	while (nCount < len(nList)):
 | 
| 
 | 
   116 		# initialisation, get the chr, geneID, geneName
 | 
| 
 | 
   117 		init	= tList[tCount].split()
 | 
| 
 | 
   118 		initial = init[5] 
 | 
| 
 | 
   119 		_exon 	= init[6]
 | 
| 
 | 
   120 		chr 	= init[1]
 | 
| 
 | 
   121 		gene 	= init[0]
 | 
| 
 | 
   122 		_start 	= int(init[2])
 | 
| 
 | 
   123 
 | 
| 
 | 
   124 		# check if t-gene and n-gene refer to the same gene
 | 
| 
 | 
   125 		check_init = nList[nCount].split()
 | 
| 
 | 
   126 		if check_init[5] != initial or check_init[6] != _exon:
 | 
| 
 | 
   127 			print "Initial: ", initial
 | 
| 
 | 
   128 			print "Check_Init.id: ", check_init[5]
 | 
| 
 | 
   129 			print "_Exon: ", _exon
 | 
| 
 | 
   130 			print "Check_Init.exon: ", check_init[6]
 | 
| 
 | 
   131 			print "Error. Comparing different Gene"
 | 
| 
 | 
   132 			sys.exit(1)
 | 
| 
 | 
   133 		
 | 
| 
 | 
   134 		# create data dictionary for tumour and normal data (per each regions/ exon)
 | 
| 
 | 
   135 		tCount, tDict, tDictOri = createDataDict(tCount, tList, r1, OFF, initial, _exon)
 | 
| 
 | 
   136 		nCount, nDict, nDictOri = createDataDict(nCount, nList, r2, OFF, initial, _exon)		
 | 
| 
 | 
   137 		# check number of bases in the both gene dict
 | 
| 
 | 
   138 		if len(nDict) != len(tDict):
 | 
| 
 | 
   139 			print "N:", len(nDict)
 | 
| 
 | 
   140 			print "T:", len(tDict)
 | 
| 
 | 
   141 			print "Error. Different length of dict"
 | 
| 
 | 
   142 			sys.exit(1)
 | 
| 
 | 
   143 
 | 
| 
 | 
   144 		# compare coverage
 | 
| 
 | 
   145 		count = _start
 | 
| 
 | 
   146 		_max = max(nDict.keys())
 | 
| 
 | 
   147 		ratioList = []
 | 
| 
 | 
   148 		tumourList = []
 | 
| 
 | 
   149 		normalList = []
 | 
| 
 | 
   150 		tumourOriList = []
 | 
| 
 | 
   151 		normalOriList = []
 | 
| 
 | 
   152 		while count <= _max:
 | 
| 
 | 
   153 			# get ratio
 | 
| 
 | 
   154 			if (nDict[count] < rd_threshold) and (tDict[count] < rd_threshold):
 | 
| 
 | 
   155 				ratio = 0.0
 | 
| 
 | 
   156 			else:
 | 
| 
 | 
   157 				if tDict[count] == 0:
 | 
| 
 | 
   158 					tDict[count] = 0.5
 | 
| 
 | 
   159 
 | 
| 
 | 
   160 				ratio = math.log((float(tDict[count]) / nDict[count]),2)
 | 
| 
 | 
   161 				tumourList.append(tDict[count])
 | 
| 
 | 
   162 				tumourOriList.append(tDictOri[count])
 | 
| 
 | 
   163 				normalList.append(nDict[count])
 | 
| 
 | 
   164 				normalOriList.append(nDictOri[count])
 | 
| 
 | 
   165 				ratioList.append(ratio)
 | 
| 
 | 
   166 		
 | 
| 
 | 
   167 			count += 1
 | 
| 
 | 
   168 
 | 
| 
 | 
   169 		ratioLen = len(ratioList)
 | 
| 
 | 
   170 
 | 
| 
 | 
   171 		# get average
 | 
| 
 | 
   172 		avg = getAverage(ratioList)
 | 
| 
 | 
   173 		sd = getStdDev(ratioList, avg)
 | 
| 
 | 
   174 		tumourAvg= str(round(getAverage(tumourList),3))
 | 
| 
 | 
   175 		normalAvg= str(round(getAverage(normalList),3))
 | 
| 
 | 
   176 		tumourOriAvg = str(round(getAverage(tumourOriList),3))
 | 
| 
 | 
   177 		normalOriAvg = str(round(getAverage(normalOriList),3))
 | 
| 
 | 
   178 
 | 
| 
 | 
   179 		# get median
 | 
| 
 | 
   180 		ratioList.sort()
 | 
| 
 | 
   181 		min_logratio, max_logratio = getMinMax(ratioList)
 | 
| 
 | 
   182 		median	= getMedian(ratioList)
 | 
| 
 | 
   183 
 | 
| 
 | 
   184 		# write output
 | 
| 
 | 
   185 		if ratioLen >= minNBases:
 | 
| 
 | 
   186 			output.write(initial + "\t" + gene + "\t" + str(ratioLen) + "\t")
 | 
| 
 | 
   187 			output.write(str(round(avg,3))+ "\t"+ str(count)+ "\t" + _exon + "\t")
 | 
| 
 | 
   188 			output.write(str(round(sd ,3))+ "\t"+ tumourAvg + "\t" + normalAvg +"\t") 
 | 
| 
 | 
   189 			output.write(tumourOriAvg + "\t" + normalOriAvg + "\t")
 | 
| 
 | 
   190 			output.write(str(round(median,3)) + "\t" + str(round(min_logratio,3)) + "\t")
 | 
| 
 | 
   191 			output.write(str(round(max_logratio,3)) + "\n")
 | 
| 
 | 
   192 
 | 
| 
 | 
   193 	output.close()
 | 
| 
 | 
   194 
 | 
| 
 | 
   195 	#print "End of averageCount.py with the last target = '%s'" %(initial) 
 |