Mercurial > repos > xuebing > sharplabtool
comparison tools/ilmn_pacbio/cov_model.py @ 0:9071e359b9a3
Uploaded
| author | xuebing |
|---|---|
| date | Fri, 09 Mar 2012 19:37:19 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:9071e359b9a3 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 from optparse import OptionParser, SUPPRESS_HELP | |
| 3 import os, random, quake | |
| 4 | |
| 5 ############################################################ | |
| 6 # cov_model.py | |
| 7 # | |
| 8 # Given a file of kmer counts, reports the cutoff to use | |
| 9 # to separate trusted/untrusted kmers. | |
| 10 ############################################################ | |
| 11 | |
| 12 ############################################################ | |
| 13 # main | |
| 14 ############################################################ | |
| 15 def main(): | |
| 16 usage = 'usage: %prog [options] <counts file>' | |
| 17 parser = OptionParser(usage) | |
| 18 parser.add_option('--int', dest='count_kmers', action='store_true', default=False, help='Kmers were counted as integers w/o the use of quality values [default: %default]') | |
| 19 parser.add_option('--ratio', dest='ratio', type='int', default=200, help='Likelihood ratio to set trusted/untrusted cutoff [default: %default]') | |
| 20 parser.add_option('--no_sample', dest='no_sample', action='store_true', default=False, help='Do not sample kmer coverages into kmers.txt because its already done [default: %default]') | |
| 21 # help='Model kmer coverage as a function of GC content of kmers [default: %default]' | |
| 22 parser.add_option('--gc', dest='model_gc', action='store_true', default=False, help=SUPPRESS_HELP) | |
| 23 (options, args) = parser.parse_args() | |
| 24 | |
| 25 if len(args) != 1: | |
| 26 parser.error('Must provide kmers counts file') | |
| 27 else: | |
| 28 ctsf = args[0] | |
| 29 | |
| 30 if options.count_kmers: | |
| 31 model_cutoff(ctsf, options.ratio) | |
| 32 print 'Cutoff: %s' % open('cutoff.txt').readline().rstrip() | |
| 33 | |
| 34 else: | |
| 35 if options.model_gc: | |
| 36 model_q_gc_cutoffs(ctsf, 25000, options.ratio) | |
| 37 else: | |
| 38 model_q_cutoff(ctsf, 50000, options.ratio, options.no_sample) | |
| 39 print 'Cutoff: %s' % open('cutoff.txt').readline().rstrip() | |
| 40 | |
| 41 | |
| 42 ############################################################ | |
| 43 # model_cutoff | |
| 44 # | |
| 45 # Make a histogram of kmers to give to R to learn the cutoff | |
| 46 ############################################################ | |
| 47 def model_cutoff(ctsf, ratio): | |
| 48 # make kmer histogram | |
| 49 cov_max = 0 | |
| 50 for line in open(ctsf): | |
| 51 cov = int(line.split()[1]) | |
| 52 if cov > cov_max: | |
| 53 cov_max = cov | |
| 54 | |
| 55 kmer_hist = [0]*cov_max | |
| 56 for line in open(ctsf): | |
| 57 cov = int(line.split()[1]) | |
| 58 kmer_hist[cov-1] += 1 | |
| 59 | |
| 60 cov_out = open('kmers.hist', 'w') | |
| 61 for cov in range(0,cov_max): | |
| 62 if kmer_hist[cov]: | |
| 63 print >> cov_out, '%d\t%d' % (cov+1,kmer_hist[cov]) | |
| 64 cov_out.close() | |
| 65 | |
| 66 os.system('R --slave --args %d < %s/cov_model.r 2> r.log' % (ratio,quake.quake_dir)) | |
| 67 | |
| 68 | |
| 69 ############################################################ | |
| 70 # model_q_cutoff | |
| 71 # | |
| 72 # Sample kmers to give to R to learn the cutoff | |
| 73 # 'div100' is necessary when the number of kmers is too | |
| 74 # large for random.sample, so we only consider every 100th | |
| 75 # kmer. | |
| 76 ############################################################ | |
| 77 def model_q_cutoff(ctsf, sample, ratio, no_sample=False): | |
| 78 if not no_sample: | |
| 79 # count number of kmer coverages | |
| 80 num_covs = 0 | |
| 81 for line in open(ctsf): | |
| 82 num_covs += 1 | |
| 83 | |
| 84 # choose random kmer coverages | |
| 85 div100 = False | |
| 86 if sample >= num_covs: | |
| 87 rand_covs = range(num_covs) | |
| 88 else: | |
| 89 if num_covs > 1000000000: | |
| 90 div100 = True | |
| 91 rand_covs = random.sample(xrange(num_covs/100), sample) | |
| 92 else: | |
| 93 rand_covs = random.sample(xrange(num_covs), sample) | |
| 94 rand_covs.sort() | |
| 95 | |
| 96 # print to file | |
| 97 out = open('kmers.txt', 'w') | |
| 98 kmer_i = 0 | |
| 99 rand_i = 0 | |
| 100 for line in open(ctsf): | |
| 101 if div100: | |
| 102 if kmer_i % 100 == 0 and kmer_i/100 == rand_covs[rand_i]: | |
| 103 print >> out, line.split()[1] | |
| 104 rand_i += 1 | |
| 105 if rand_i >= sample: | |
| 106 break | |
| 107 else: | |
| 108 if kmer_i == rand_covs[rand_i]: | |
| 109 print >> out, line.split()[1] | |
| 110 rand_i += 1 | |
| 111 if rand_i >= sample: | |
| 112 break | |
| 113 kmer_i += 1 | |
| 114 out.close() | |
| 115 | |
| 116 os.system('R --slave --args %d < %s/cov_model_qmer.r 2> r.log' % (ratio,quake.quake_dir)) | |
| 117 | |
| 118 | |
| 119 ############################################################ | |
| 120 # model_q_gc_cutoffs | |
| 121 # | |
| 122 # Sample kmers to give to R to learn the cutoff for each | |
| 123 # GC value | |
| 124 ############################################################ | |
| 125 def model_q_gc_cutoffs(ctsf, sample, ratio): | |
| 126 # count number of kmer coverages at each at | |
| 127 k = len(open(ctsf).readline().split()[0]) | |
| 128 num_covs_at = [0]*(k+1) | |
| 129 for line in open(ctsf): | |
| 130 kmer = line.split()[0] | |
| 131 num_covs_at[count_at(kmer)] += 1 | |
| 132 | |
| 133 # for each AT bin | |
| 134 at_cutoffs = [] | |
| 135 for at in range(1,k): | |
| 136 # sample covs | |
| 137 if sample >= num_covs_at[at]: | |
| 138 rand_covs = range(num_covs_at[at]) | |
| 139 else: | |
| 140 rand_covs = random.sample(xrange(num_covs_at[at]), sample) | |
| 141 rand_covs.sort() | |
| 142 | |
| 143 # print to file | |
| 144 out = open('kmers.txt', 'w') | |
| 145 kmer_i = 0 | |
| 146 rand_i = 0 | |
| 147 for line in open(ctsf): | |
| 148 (kmer,cov) = line.split() | |
| 149 if count_at(kmer) == at: | |
| 150 if kmer_i == rand_covs[rand_i]: | |
| 151 print >> out, cov | |
| 152 rand_i += 1 | |
| 153 if rand_i >= sample: | |
| 154 break | |
| 155 kmer_i += 1 | |
| 156 out.close() | |
| 157 | |
| 158 os.system('R --slave --args %d < %s/cov_model_qmer.r 2> r%d.log' % (ratio,quake.quake_dir,at)) | |
| 159 | |
| 160 at_cutoffs.append( open('cutoff.txt').readline().rstrip() ) | |
| 161 if at in [1,k-1]: # setting extremes to next closests | |
| 162 at_cutoffs.append( open('cutoff.txt').readline().rstrip() ) | |
| 163 | |
| 164 os.system('mv kmers.txt kmers.at%d.txt' % at) | |
| 165 os.system('mv cutoff.txt cutoff.at%d.txt' % at) | |
| 166 | |
| 167 out = open('cutoffs.gc.txt','w') | |
| 168 print >> out, '\n'.join(at_cutoffs) | |
| 169 out.close() | |
| 170 | |
| 171 | |
| 172 ############################################################ | |
| 173 # model_q_gc_cutoffs_bigmem | |
| 174 # | |
| 175 # Sample kmers to give to R to learn the cutoff for each | |
| 176 # GC value | |
| 177 ############################################################ | |
| 178 def model_q_gc_cutoffs_bigmem(ctsf, sample, ratio): | |
| 179 # input coverages | |
| 180 k = 0 | |
| 181 for line in open(ctsf): | |
| 182 (kmer,cov) = line.split() | |
| 183 if k == 0: | |
| 184 k = len(kmer) | |
| 185 at_covs = ['']*(k+1) | |
| 186 else: | |
| 187 at = count_at(kmer) | |
| 188 if at_covs[at]: | |
| 189 at_covs[at].append(cov) | |
| 190 else: | |
| 191 at_covs[at] = [cov] | |
| 192 | |
| 193 for at in range(1,k): | |
| 194 print '%d %d' % (at,len(at_covs[at])) | |
| 195 | |
| 196 # for each AT bin | |
| 197 at_cutoffs = [] | |
| 198 for at in range(1,k): | |
| 199 # sample covs | |
| 200 if sample >= len(at_covs[at]): | |
| 201 rand_covs = at_covs[at] | |
| 202 else: | |
| 203 rand_covs = random.sample(at_covs[at], sample) | |
| 204 | |
| 205 # print to file | |
| 206 out = open('kmers.txt', 'w') | |
| 207 for rc in rand_covs: | |
| 208 print >> out, rc | |
| 209 out.close() | |
| 210 | |
| 211 os.system('R --slave --args %d < %s/cov_model_qmer.r 2> r%d.log' % (ratio,quake.quake_dir,at)) | |
| 212 | |
| 213 at_cutoffs.append( open('cutoff.txt').readline().rstrip() ) | |
| 214 if at in [1,k-1]: # setting extremes to next closests | |
| 215 at_cutoffs.append( open('cutoff.txt').readline().rstrip() ) | |
| 216 | |
| 217 os.system('mv kmers.txt kmers.at%d.txt' % at) | |
| 218 os.system('mv cutoff.txt cutoff.at%d.txt' % at) | |
| 219 | |
| 220 out = open('cutoffs.gc.txt','w') | |
| 221 print >> out, '\n'.join(at_cutoffs) | |
| 222 out.close() | |
| 223 | |
| 224 | |
| 225 ############################################################ | |
| 226 # count_at | |
| 227 # | |
| 228 # Count A's and T's in the given sequence | |
| 229 ############################################################ | |
| 230 def count_at(seq): | |
| 231 return len([nt for nt in seq if nt in ['A','T']]) | |
| 232 | |
| 233 | |
| 234 ############################################################ | |
| 235 # __main__ | |
| 236 ############################################################ | |
| 237 if __name__ == '__main__': | |
| 238 main() |
