Mercurial > repos > xuebing > sharplabtool
view tools/ilmn_pacbio/cov_model.py @ 1:cdcb0ce84a1b
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:45:15 -0500 |
parents | 9071e359b9a3 |
children |
line wrap: on
line source
#!/usr/bin/env python from optparse import OptionParser, SUPPRESS_HELP import os, random, quake ############################################################ # cov_model.py # # Given a file of kmer counts, reports the cutoff to use # to separate trusted/untrusted kmers. ############################################################ ############################################################ # main ############################################################ def main(): usage = 'usage: %prog [options] <counts file>' parser = OptionParser(usage) parser.add_option('--int', dest='count_kmers', action='store_true', default=False, help='Kmers were counted as integers w/o the use of quality values [default: %default]') parser.add_option('--ratio', dest='ratio', type='int', default=200, help='Likelihood ratio to set trusted/untrusted cutoff [default: %default]') parser.add_option('--no_sample', dest='no_sample', action='store_true', default=False, help='Do not sample kmer coverages into kmers.txt because its already done [default: %default]') # help='Model kmer coverage as a function of GC content of kmers [default: %default]' parser.add_option('--gc', dest='model_gc', action='store_true', default=False, help=SUPPRESS_HELP) (options, args) = parser.parse_args() if len(args) != 1: parser.error('Must provide kmers counts file') else: ctsf = args[0] if options.count_kmers: model_cutoff(ctsf, options.ratio) print 'Cutoff: %s' % open('cutoff.txt').readline().rstrip() else: if options.model_gc: model_q_gc_cutoffs(ctsf, 25000, options.ratio) else: model_q_cutoff(ctsf, 50000, options.ratio, options.no_sample) print 'Cutoff: %s' % open('cutoff.txt').readline().rstrip() ############################################################ # model_cutoff # # Make a histogram of kmers to give to R to learn the cutoff ############################################################ def model_cutoff(ctsf, ratio): # make kmer histogram cov_max = 0 for line in open(ctsf): cov = int(line.split()[1]) if cov > cov_max: cov_max = cov kmer_hist = [0]*cov_max for line in open(ctsf): cov = int(line.split()[1]) kmer_hist[cov-1] += 1 cov_out = open('kmers.hist', 'w') for cov in range(0,cov_max): if kmer_hist[cov]: print >> cov_out, '%d\t%d' % (cov+1,kmer_hist[cov]) cov_out.close() os.system('R --slave --args %d < %s/cov_model.r 2> r.log' % (ratio,quake.quake_dir)) ############################################################ # model_q_cutoff # # Sample kmers to give to R to learn the cutoff # 'div100' is necessary when the number of kmers is too # large for random.sample, so we only consider every 100th # kmer. ############################################################ def model_q_cutoff(ctsf, sample, ratio, no_sample=False): if not no_sample: # count number of kmer coverages num_covs = 0 for line in open(ctsf): num_covs += 1 # choose random kmer coverages div100 = False if sample >= num_covs: rand_covs = range(num_covs) else: if num_covs > 1000000000: div100 = True rand_covs = random.sample(xrange(num_covs/100), sample) else: rand_covs = random.sample(xrange(num_covs), sample) rand_covs.sort() # print to file out = open('kmers.txt', 'w') kmer_i = 0 rand_i = 0 for line in open(ctsf): if div100: if kmer_i % 100 == 0 and kmer_i/100 == rand_covs[rand_i]: print >> out, line.split()[1] rand_i += 1 if rand_i >= sample: break else: if kmer_i == rand_covs[rand_i]: print >> out, line.split()[1] rand_i += 1 if rand_i >= sample: break kmer_i += 1 out.close() os.system('R --slave --args %d < %s/cov_model_qmer.r 2> r.log' % (ratio,quake.quake_dir)) ############################################################ # model_q_gc_cutoffs # # Sample kmers to give to R to learn the cutoff for each # GC value ############################################################ def model_q_gc_cutoffs(ctsf, sample, ratio): # count number of kmer coverages at each at k = len(open(ctsf).readline().split()[0]) num_covs_at = [0]*(k+1) for line in open(ctsf): kmer = line.split()[0] num_covs_at[count_at(kmer)] += 1 # for each AT bin at_cutoffs = [] for at in range(1,k): # sample covs if sample >= num_covs_at[at]: rand_covs = range(num_covs_at[at]) else: rand_covs = random.sample(xrange(num_covs_at[at]), sample) rand_covs.sort() # print to file out = open('kmers.txt', 'w') kmer_i = 0 rand_i = 0 for line in open(ctsf): (kmer,cov) = line.split() if count_at(kmer) == at: if kmer_i == rand_covs[rand_i]: print >> out, cov rand_i += 1 if rand_i >= sample: break kmer_i += 1 out.close() os.system('R --slave --args %d < %s/cov_model_qmer.r 2> r%d.log' % (ratio,quake.quake_dir,at)) at_cutoffs.append( open('cutoff.txt').readline().rstrip() ) if at in [1,k-1]: # setting extremes to next closests at_cutoffs.append( open('cutoff.txt').readline().rstrip() ) os.system('mv kmers.txt kmers.at%d.txt' % at) os.system('mv cutoff.txt cutoff.at%d.txt' % at) out = open('cutoffs.gc.txt','w') print >> out, '\n'.join(at_cutoffs) out.close() ############################################################ # model_q_gc_cutoffs_bigmem # # Sample kmers to give to R to learn the cutoff for each # GC value ############################################################ def model_q_gc_cutoffs_bigmem(ctsf, sample, ratio): # input coverages k = 0 for line in open(ctsf): (kmer,cov) = line.split() if k == 0: k = len(kmer) at_covs = ['']*(k+1) else: at = count_at(kmer) if at_covs[at]: at_covs[at].append(cov) else: at_covs[at] = [cov] for at in range(1,k): print '%d %d' % (at,len(at_covs[at])) # for each AT bin at_cutoffs = [] for at in range(1,k): # sample covs if sample >= len(at_covs[at]): rand_covs = at_covs[at] else: rand_covs = random.sample(at_covs[at], sample) # print to file out = open('kmers.txt', 'w') for rc in rand_covs: print >> out, rc out.close() os.system('R --slave --args %d < %s/cov_model_qmer.r 2> r%d.log' % (ratio,quake.quake_dir,at)) at_cutoffs.append( open('cutoff.txt').readline().rstrip() ) if at in [1,k-1]: # setting extremes to next closests at_cutoffs.append( open('cutoff.txt').readline().rstrip() ) os.system('mv kmers.txt kmers.at%d.txt' % at) os.system('mv cutoff.txt cutoff.at%d.txt' % at) out = open('cutoffs.gc.txt','w') print >> out, '\n'.join(at_cutoffs) out.close() ############################################################ # count_at # # Count A's and T's in the given sequence ############################################################ def count_at(seq): return len([nt for nt in seq if nt in ['A','T']]) ############################################################ # __main__ ############################################################ if __name__ == '__main__': main()