diff Contra/scripts/average_count.py @ 0:7564f3b1e675

Uploaded
author fcaramia
date Thu, 13 Sep 2012 02:31:43 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Contra/scripts/average_count.py	Thu Sep 13 02:31:43 2012 -0400
@@ -0,0 +1,195 @@
+# ----------------------------------------------------------------------#
+# Copyright (c) 2011, Richard Lupat & Jason Li.
+#
+# > Source License <
+# This file is part of CONTRA.
+#
+#    CONTRA is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, either version 3 of the License, or
+#    (at your option) any later version.
+#
+#    CONTRA is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with CONTRA.  If not, see <http://www.gnu.org/licenses/>.
+#
+# 
+#-----------------------------------------------------------------------#
+# Last Updated : 28 Sep 2011 11:00AM
+
+import sys
+import math
+
+def getAverage(list1):
+	if len(list1) > 0:
+		return float(sum(list1))/len(list1)
+	
+	return 0.0
+
+def getStdDev(list1, avg):
+	var = 0.0
+	for x in list1:
+		var += (avg - x) ** 2
+
+	if (len(list1)-1) > 0:
+		var /= (len(list1)-1)
+
+	return math.sqrt(var)
+
+def getMinMax(list1):
+	length = len(list1)
+	if length != 0:
+       		min = list1[0]
+        	max = list1[length-1]
+	else:
+        	min = 0
+        	max = 0
+
+	return min, max
+
+def getMedian(list1):
+	length = len(list1)
+	if length == 0:
+        	median = 0
+	elif length % 2 == 0:
+        	median = (list1[length/2]+list1[(length/2) - 1])/2
+	else:
+        	median = list1[length/2]
+	return median
+
+def createDataDict(count, list1, r, offset, id_check, exon_check):
+	tDict 	 = {}
+	tDictOri = {}
+
+	while count < len(list1):
+		t 	= list1[count].split()
+		tId     = t[5]
+		tExon   = t[6]
+
+		if (tId != id_check) or (tExon != exon_check):
+       	 		return count, tDict, tDictOri
+
+        	tStart  = int(t[2])
+       		tEnd    = int(t[3])
+	        tCov    = float(t[4]) / r + offset       #GeoMean Normalisation
+        	tCovOri = float(t[4]) + offset     #without scaling
+
+        	#filling dict
+        	while tStart < tEnd:
+                	tDict[tStart] = tCov
+                	tDictOri[tStart] = tCovOri #without scaling
+                	tStart += 1
+
+        	count += 1
+
+	return count, tDict, tDictOri
+
+def getFactor (val1, val2):
+	r = math.sqrt(val1 * val2)
+	r1 = val1/r
+	r2 = val2/r
+	return r1, r2
+
+def averageCount(tFile, nFile, averageOut, tReadCount, nReadCount, rd_threshold, minNBases):
+	tList = file.readlines(open(tFile))
+	nList = file.readlines(open(nFile))
+	# constant & counter
+	OFF    = 1
+	tCount = 0
+	nCount = 0
+
+	# create and open files
+	output = open(averageOut, "w")
+	
+	# Offset and Ratio for Geometric Mean Normalisation
+	r1, r2 = getFactor(tReadCount, nReadCount)	
+	if rd_threshold > 0:
+		#OFF = 0 
+		OFF = 0.5
+
+	#big loop
+	while (nCount < len(nList)):
+		# initialisation, get the chr, geneID, geneName
+		init	= tList[tCount].split()
+		initial = init[5] 
+		_exon 	= init[6]
+		chr 	= init[1]
+		gene 	= init[0]
+		_start 	= int(init[2])
+
+		# check if t-gene and n-gene refer to the same gene
+		check_init = nList[nCount].split()
+		if check_init[5] != initial or check_init[6] != _exon:
+			print "Initial: ", initial
+			print "Check_Init.id: ", check_init[5]
+			print "_Exon: ", _exon
+			print "Check_Init.exon: ", check_init[6]
+			print "Error. Comparing different Gene"
+			sys.exit(1)
+		
+		# create data dictionary for tumour and normal data (per each regions/ exon)
+		tCount, tDict, tDictOri = createDataDict(tCount, tList, r1, OFF, initial, _exon)
+		nCount, nDict, nDictOri = createDataDict(nCount, nList, r2, OFF, initial, _exon)		
+		# check number of bases in the both gene dict
+		if len(nDict) != len(tDict):
+			print "N:", len(nDict)
+			print "T:", len(tDict)
+			print "Error. Different length of dict"
+			sys.exit(1)
+
+		# compare coverage
+		count = _start
+		_max = max(nDict.keys())
+		ratioList = []
+		tumourList = []
+		normalList = []
+		tumourOriList = []
+		normalOriList = []
+		while count <= _max:
+			# get ratio
+			if (nDict[count] < rd_threshold) and (tDict[count] < rd_threshold):
+				ratio = 0.0
+			else:
+				if tDict[count] == 0:
+					tDict[count] = 0.5
+
+				ratio = math.log((float(tDict[count]) / nDict[count]),2)
+				tumourList.append(tDict[count])
+				tumourOriList.append(tDictOri[count])
+				normalList.append(nDict[count])
+				normalOriList.append(nDictOri[count])
+				ratioList.append(ratio)
+		
+			count += 1
+
+		ratioLen = len(ratioList)
+
+		# get average
+		avg = getAverage(ratioList)
+		sd = getStdDev(ratioList, avg)
+		tumourAvg= str(round(getAverage(tumourList),3))
+		normalAvg= str(round(getAverage(normalList),3))
+		tumourOriAvg = str(round(getAverage(tumourOriList),3))
+		normalOriAvg = str(round(getAverage(normalOriList),3))
+
+		# get median
+		ratioList.sort()
+		min_logratio, max_logratio = getMinMax(ratioList)
+		median	= getMedian(ratioList)
+
+		# write output
+		if ratioLen >= minNBases:
+			output.write(initial + "\t" + gene + "\t" + str(ratioLen) + "\t")
+			output.write(str(round(avg,3))+ "\t"+ str(count)+ "\t" + _exon + "\t")
+			output.write(str(round(sd ,3))+ "\t"+ tumourAvg + "\t" + normalAvg +"\t") 
+			output.write(tumourOriAvg + "\t" + normalOriAvg + "\t")
+			output.write(str(round(median,3)) + "\t" + str(round(min_logratio,3)) + "\t")
+			output.write(str(round(max_logratio,3)) + "\n")
+
+	output.close()
+
+	#print "End of averageCount.py with the last target = '%s'" %(initial)