diff Contra/scripts/assign_bin_number_v2.py @ 0:7564f3b1e675

Uploaded
author fcaramia
date Thu, 13 Sep 2012 02:31:43 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Contra/scripts/assign_bin_number_v2.py	Thu Sep 13 02:31:43 2012 -0400
@@ -0,0 +1,153 @@
+# ----------------------------------------------------------------------#
+# Copyright (c) 2011, Richard Lupat & Jason Li.
+#
+# > Source License <
+# This file is part of CONTRA.
+#
+#    CONTRA is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, either version 3 of the License, or
+#    (at your option) any later version.
+#
+#    CONTRA is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with CONTRA.  If not, see <http://www.gnu.org/licenses/>.
+#
+# 
+#-----------------------------------------------------------------------#
+# Last Updated : 12 October 2011 16:43PM
+
+
+def assignBin(binNumber, srcFile, binFile, targetList, minExons):
+#def assignBin(binNumber, srcFile, binFile, minExons):
+	from math import log
+
+	src = file.readlines(open(srcFile))
+	#binOut = open(binFile, "w")
+
+	minExons = int(minExons)
+	count = 0
+	logcov_list = []
+
+	# Get the Log Coverage List for the normal sample
+	for exon in src:
+		exon 	= exon.split()
+		cov1	= float(exon[7])
+		cov2	= float(exon[8])
+		cov	= (cov1 + cov2) / 2
+		if (cov > 0):
+			logcov	= log(cov, 2)
+		else:
+			logcov	= 0
+		logcov_list.append(logcov)
+	#print "Len of logcov_list:", len(logcov_list)
+
+	# Calculate the boundaries of the bins
+	minLog 		= min(logcov_list)
+	maxLog 		= max(logcov_list)
+	boundary	= (maxLog-minLog)/binNumber
+	#print "Min, Max, Boundary, BinNumber: ", minLog, maxLog, boundary, binNumber
+
+
+	# Split exons to different bins
+	bin_list = []
+	boundary_dict = {}
+	for logcov in logcov_list:
+		i = 1
+		set_boundary = minLog + boundary
+		while (logcov > set_boundary):
+			i += 1
+			set_boundary = minLog + (boundary * i)
+		#boundary_dict[i] = set_boundary
+		bin_list.append(i)
+
+	for i in range(binNumber+2):
+		boundary_dict[i] = minLog + (boundary * i)
+
+
+	# Check the number of exons for each bin
+	# Merge with the adjacent bins if it is too small
+	for z in range(1, binNumber+1):
+		element = bin_list.count(z)
+		#print "Bin", z, "has", element, "elements"
+		if (element < minExons):
+			while (bin_list.count(z) != 0):
+				if (z != binNumber):
+					bin_list[bin_list.index(z)]+=1 
+				else:
+					bin_list[bin_list.index(z)]-=1
+
+
+	# Check the number of exons in the last bin
+	last_bin_number = sorted(set(bin_list))[-1]
+	if len(set(bin_list)) > 1:
+		second_last_bin	= sorted(set(bin_list))[-2]
+	else:
+		second_last_bin = last_bin_number
+	element		= bin_list.count(last_bin_number)
+	if (element < minExons):
+		while (bin_list.count(last_bin_number) != 0):
+			if (last_bin_number != 1):
+				#bin_list[bin_list.index(last_bin_number)] -= 1
+				bin_list[bin_list.index(last_bin_number)] = second_last_bin
+
+	final_number_of_bin = len(set(bin_list))
+
+	# List out the binning boundaries in a txt file
+	boundary_list = [boundary_dict[x] for x in sorted(set(bin_list))]
+	i = 1
+
+	#boundary_file = binFile + str(final_number_of_bin) + ".boundaries.txt"
+	boundary_file = binFile + str(binNumber) + ".boundaries.txt"
+	boOut	= open(boundary_file, "w")
+	boOut.write("\t".join([str(0), str(minLog)])+"\n")
+	for z in boundary_list:
+		if (i==final_number_of_bin):
+			z = maxLog
+		boOut.write("\t".join([str(i), str(z)])+"\n")
+		i += 1	
+	boOut.close()
+
+	# Re-sort the bin numbers - get rid of gaps 
+	curr_z = 1
+	bin_number_dict = {}
+	for z in sorted(set(bin_list)):
+		bin_number_dict[z] = curr_z
+		curr_z += 1
+
+
+	# Append the bin number to the original file 
+	#binFile = binFile + str(final_number_of_bin)+".txt"
+	binFile	= binFile + str(binNumber) + ".txt"
+	binOut	= open(binFile, "w")
+	
+	for exons in src:
+		exon 	= exons.split()
+		id	= int(exon[0])
+		gene	= exon[1]
+		exonNumber	= exon[5]
+		
+		target	= targetList[int(id)-1]
+		if target.id == id:
+			chr		= target.chr
+			oriStart	= target.start
+			oriEnd		= target.end
+
+		else:
+			print "ERROR..."
+
+		f_bin_number = str(bin_number_dict[bin_list[count]])
+		binOut.write("\t".join([exons.strip("\n"), f_bin_number,chr, oriStart, oriEnd]))
+		binOut.write("\n")
+		count += 1
+
+	binOut.close()
+	print "End of assign.bin.number.py with %s exons in %s bins" %(count, final_number_of_bin)
+
+
+	return final_number_of_bin
+