Mercurial > repos > fcaramia > contra
view Contra/scripts/assign_bin_number_v2.py @ 3:94362f37962e
Uploaded
author | fcaramia |
---|---|
date | Thu, 13 Sep 2012 02:43:53 -0400 |
parents | 7564f3b1e675 |
children |
line wrap: on
line source
# ----------------------------------------------------------------------# # Copyright (c) 2011, Richard Lupat & Jason Li. # # > Source License < # This file is part of CONTRA. # # CONTRA is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # CONTRA is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with CONTRA. If not, see <http://www.gnu.org/licenses/>. # # #-----------------------------------------------------------------------# # Last Updated : 12 October 2011 16:43PM def assignBin(binNumber, srcFile, binFile, targetList, minExons): #def assignBin(binNumber, srcFile, binFile, minExons): from math import log src = file.readlines(open(srcFile)) #binOut = open(binFile, "w") minExons = int(minExons) count = 0 logcov_list = [] # Get the Log Coverage List for the normal sample for exon in src: exon = exon.split() cov1 = float(exon[7]) cov2 = float(exon[8]) cov = (cov1 + cov2) / 2 if (cov > 0): logcov = log(cov, 2) else: logcov = 0 logcov_list.append(logcov) #print "Len of logcov_list:", len(logcov_list) # Calculate the boundaries of the bins minLog = min(logcov_list) maxLog = max(logcov_list) boundary = (maxLog-minLog)/binNumber #print "Min, Max, Boundary, BinNumber: ", minLog, maxLog, boundary, binNumber # Split exons to different bins bin_list = [] boundary_dict = {} for logcov in logcov_list: i = 1 set_boundary = minLog + boundary while (logcov > set_boundary): i += 1 set_boundary = minLog + (boundary * i) #boundary_dict[i] = set_boundary bin_list.append(i) for i in range(binNumber+2): boundary_dict[i] = minLog + (boundary * i) # Check the number of exons for each bin # Merge with the adjacent bins if it is too small for z in range(1, binNumber+1): element = bin_list.count(z) #print "Bin", z, "has", element, "elements" if (element < minExons): while (bin_list.count(z) != 0): if (z != binNumber): bin_list[bin_list.index(z)]+=1 else: bin_list[bin_list.index(z)]-=1 # Check the number of exons in the last bin last_bin_number = sorted(set(bin_list))[-1] if len(set(bin_list)) > 1: second_last_bin = sorted(set(bin_list))[-2] else: second_last_bin = last_bin_number element = bin_list.count(last_bin_number) if (element < minExons): while (bin_list.count(last_bin_number) != 0): if (last_bin_number != 1): #bin_list[bin_list.index(last_bin_number)] -= 1 bin_list[bin_list.index(last_bin_number)] = second_last_bin final_number_of_bin = len(set(bin_list)) # List out the binning boundaries in a txt file boundary_list = [boundary_dict[x] for x in sorted(set(bin_list))] i = 1 #boundary_file = binFile + str(final_number_of_bin) + ".boundaries.txt" boundary_file = binFile + str(binNumber) + ".boundaries.txt" boOut = open(boundary_file, "w") boOut.write("\t".join([str(0), str(minLog)])+"\n") for z in boundary_list: if (i==final_number_of_bin): z = maxLog boOut.write("\t".join([str(i), str(z)])+"\n") i += 1 boOut.close() # Re-sort the bin numbers - get rid of gaps curr_z = 1 bin_number_dict = {} for z in sorted(set(bin_list)): bin_number_dict[z] = curr_z curr_z += 1 # Append the bin number to the original file #binFile = binFile + str(final_number_of_bin)+".txt" binFile = binFile + str(binNumber) + ".txt" binOut = open(binFile, "w") for exons in src: exon = exons.split() id = int(exon[0]) gene = exon[1] exonNumber = exon[5] target = targetList[int(id)-1] if target.id == id: chr = target.chr oriStart = target.start oriEnd = target.end else: print "ERROR..." f_bin_number = str(bin_number_dict[bin_list[count]]) binOut.write("\t".join([exons.strip("\n"), f_bin_number,chr, oriStart, oriEnd])) binOut.write("\n") count += 1 binOut.close() print "End of assign.bin.number.py with %s exons in %s bins" %(count, final_number_of_bin) return final_number_of_bin