Mercurial > repos > fcaramia > contra
diff Contra/scripts/assign_bin_number_v2.py @ 0:7564f3b1e675
Uploaded
author | fcaramia |
---|---|
date | Thu, 13 Sep 2012 02:31:43 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Contra/scripts/assign_bin_number_v2.py Thu Sep 13 02:31:43 2012 -0400 @@ -0,0 +1,153 @@ +# ----------------------------------------------------------------------# +# Copyright (c) 2011, Richard Lupat & Jason Li. +# +# > Source License < +# This file is part of CONTRA. +# +# CONTRA is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# CONTRA is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with CONTRA. If not, see <http://www.gnu.org/licenses/>. +# +# +#-----------------------------------------------------------------------# +# Last Updated : 12 October 2011 16:43PM + + +def assignBin(binNumber, srcFile, binFile, targetList, minExons): +#def assignBin(binNumber, srcFile, binFile, minExons): + from math import log + + src = file.readlines(open(srcFile)) + #binOut = open(binFile, "w") + + minExons = int(minExons) + count = 0 + logcov_list = [] + + # Get the Log Coverage List for the normal sample + for exon in src: + exon = exon.split() + cov1 = float(exon[7]) + cov2 = float(exon[8]) + cov = (cov1 + cov2) / 2 + if (cov > 0): + logcov = log(cov, 2) + else: + logcov = 0 + logcov_list.append(logcov) + #print "Len of logcov_list:", len(logcov_list) + + # Calculate the boundaries of the bins + minLog = min(logcov_list) + maxLog = max(logcov_list) + boundary = (maxLog-minLog)/binNumber + #print "Min, Max, Boundary, BinNumber: ", minLog, maxLog, boundary, binNumber + + + # Split exons to different bins + bin_list = [] + boundary_dict = {} + for logcov in logcov_list: + i = 1 + set_boundary = minLog + boundary + while (logcov > set_boundary): + i += 1 + set_boundary = minLog + (boundary * i) + #boundary_dict[i] = set_boundary + bin_list.append(i) + + for i in range(binNumber+2): + boundary_dict[i] = minLog + (boundary * i) + + + # Check the number of exons for each bin + # Merge with the adjacent bins if it is too small + for z in range(1, binNumber+1): + element = bin_list.count(z) + #print "Bin", z, "has", element, "elements" + if (element < minExons): + while (bin_list.count(z) != 0): + if (z != binNumber): + bin_list[bin_list.index(z)]+=1 + else: + bin_list[bin_list.index(z)]-=1 + + + # Check the number of exons in the last bin + last_bin_number = sorted(set(bin_list))[-1] + if len(set(bin_list)) > 1: + second_last_bin = sorted(set(bin_list))[-2] + else: + second_last_bin = last_bin_number + element = bin_list.count(last_bin_number) + if (element < minExons): + while (bin_list.count(last_bin_number) != 0): + if (last_bin_number != 1): + #bin_list[bin_list.index(last_bin_number)] -= 1 + bin_list[bin_list.index(last_bin_number)] = second_last_bin + + final_number_of_bin = len(set(bin_list)) + + # List out the binning boundaries in a txt file + boundary_list = [boundary_dict[x] for x in sorted(set(bin_list))] + i = 1 + + #boundary_file = binFile + str(final_number_of_bin) + ".boundaries.txt" + boundary_file = binFile + str(binNumber) + ".boundaries.txt" + boOut = open(boundary_file, "w") + boOut.write("\t".join([str(0), str(minLog)])+"\n") + for z in boundary_list: + if (i==final_number_of_bin): + z = maxLog + boOut.write("\t".join([str(i), str(z)])+"\n") + i += 1 + boOut.close() + + # Re-sort the bin numbers - get rid of gaps + curr_z = 1 + bin_number_dict = {} + for z in sorted(set(bin_list)): + bin_number_dict[z] = curr_z + curr_z += 1 + + + # Append the bin number to the original file + #binFile = binFile + str(final_number_of_bin)+".txt" + binFile = binFile + str(binNumber) + ".txt" + binOut = open(binFile, "w") + + for exons in src: + exon = exons.split() + id = int(exon[0]) + gene = exon[1] + exonNumber = exon[5] + + target = targetList[int(id)-1] + if target.id == id: + chr = target.chr + oriStart = target.start + oriEnd = target.end + + else: + print "ERROR..." + + f_bin_number = str(bin_number_dict[bin_list[count]]) + binOut.write("\t".join([exons.strip("\n"), f_bin_number,chr, oriStart, oriEnd])) + binOut.write("\n") + count += 1 + + binOut.close() + print "End of assign.bin.number.py with %s exons in %s bins" %(count, final_number_of_bin) + + + return final_number_of_bin +